diff read2mut.py @ 19:858ca8b7ad43 draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author mheinzl
date Mon, 22 Feb 2021 15:02:21 +0000
parents d910b6dfd826
children ac07ff15dcfd
line wrap: on
line diff
--- a/read2mut.py	Mon Feb 22 14:42:31 2021 +0000
+++ b/read2mut.py	Mon Feb 22 15:02:21 2021 +0000
@@ -16,7 +16,7 @@
 
 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam
                           --inputJson tag_count_dict.json --sscsJson SSCS_counts.json
-                          --outputFile mutant_reads_summary_short_trim.xlsx --thresh 10 --phred 20 --trim 10 --chimera_correction
+                          --outputFile mutant_reads_summary_short_trim.xlsx --thresh 10 --phred 20 --trim5 10 --trim3 10 --chimera_correction
 
 """
 
@@ -59,8 +59,10 @@
                         help='Integer threshold for displaying mutations. Only mutations occuring less than thresh times are displayed. Default of 0 displays all.')
     parser.add_argument('--phred', type=int, default=20,
                         help='Integer threshold for Phred score. Only reads higher than this threshold are considered. Default 20.')
-    parser.add_argument('--trim', type=int, default=10,
-                        help='Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10.')
+    parser.add_argument('--trim5', type=int, default=10,
+                        help='Integer threshold for assigning mutations at start of reads to lower tier. Default 10.')
+    parser.add_argument('--trim3', type=int, default=10,
+                        help='Integer threshold for assigning mutations at end of reads to lower tier. Default 10.')
     parser.add_argument('--chimera_correction', action="store_true",
                         help='Count chimeric variants and correct the variant frequencies')
     return parser
@@ -85,7 +87,8 @@
     outputFile_csv = args.outputFile_csv
     thresh = args.thresh
     phred_score = args.phred
-    trim = args.trim
+    trim5 = args.trim5
+    trim3 = args.trim3
     chimera_correction = args.chimera_correction
 
     if os.path.isfile(file1) is False:
@@ -275,6 +278,7 @@
     counter_tier41 = 0
     counter_tier42 = 0
     counter_tier5 = 0
+    counter_tier6 = 0
     row = 1
     tier_dict = {}
     chimera_dict = {}
@@ -290,7 +294,7 @@
             whole_array = list(pure_tags_dict_short[key1].keys())
 
             tier_dict[key1] = {}
-            values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0)]
+            values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0), ("tier 6", 0)]
             for k, v in values_tier_dict:
                 tier_dict[key1][k] = v
 
@@ -508,7 +512,8 @@
                         details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
                         details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
 
-                        trimmed = False
+                        trimmed_five = False
+                        trimmed_three = False
                         contradictory = False
 
                         if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
@@ -516,36 +521,65 @@
                             alt4ff = 0
                             alt2ff = 0
                             alt3ff = 0
-                            trimmed = False
+                            trimmed_five = False
+                            trimmed_three = False
                             contradictory = True
                         else:
-                            if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
+                            if ((read_pos1 >= 0) and (read_pos1 <= trim5)):
                                 beg1 = total1new
                                 total1new = 0
                                 alt1ff = 0
                                 alt1f = 0
-                                trimmed = True
+                                trimmed_five = True
+                            if ((read_pos1 >= 0) and (abs(read_len_median1 - read_pos1) <= trim3)):
+                                beg1 = total1new
+                                total1new = 0
+                                alt1ff = 0
+                                alt1f = 0
+                                trimmed_three = True
 
-                            if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))):
+                            if ((read_pos4 >= 0) and (read_pos4 <= trim5)):
+                                beg4 = total4new
+                                total4new = 0
+                                alt4ff = 0
+                                alt4f = 0
+                                trimmed_five = True
+
+                            if ((read_pos4 >= 0) and (abs(read_len_median4 - read_pos4) <= trim3)):
                                 beg4 = total4new
                                 total4new = 0
                                 alt4ff = 0
                                 alt4f = 0
-                                trimmed = True
+                                trimmed_three = True
 
-                            if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))):
+                            if ((read_pos2 >= 0) and (read_pos2 <= trim5)):
+                                beg2 = total2new
+                                total2new = 0
+                                alt2ff = 0
+                                alt2f = 0
+                                trimmed_five = True
+
+                            if ((read_pos2 >= 0) and (abs(read_len_median2 - read_pos2) <= trim3)):
                                 beg2 = total2new
                                 total2new = 0
                                 alt2ff = 0
                                 alt2f = 0
-                                trimmed = True
+                                trimmed_three = True
 
-                            if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))):
+                            if ((read_pos3 >= 0) and (read_pos3 <= trim5)):
                                 beg3 = total3new
                                 total3new = 0
                                 alt3ff = 0
                                 alt3f = 0
-                                trimmed = True
+                                trimmed_five = True
+
+                            if ((read_pos3 >= 0) and (abs(read_len_median3 - read_pos3) <= trim3)):
+                                beg3 = total3new
+                                total3new = 0
+                                alt3ff = 0
+                                alt3f = 0
+                                trimmed_three = True
+
                             details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
                             details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
 
@@ -595,20 +629,24 @@
                             counter_tier32 += 1
                             tier_dict[key1]["tier 3.2"] += 1
 
-                        elif (trimmed):
+                        elif trimmed_five:
                             tier = "4.1"
                             counter_tier41 += 1
                             tier_dict[key1]["tier 4.1"] += 1
 
-                        elif (contradictory):
+                        elif trimmed_three:
                             tier = "4.2"
                             counter_tier42 += 1
                             tier_dict[key1]["tier 4.2"] += 1
 
-                        else:
+                        elif contradictory:
                             tier = "5"
                             counter_tier5 += 1
                             tier_dict[key1]["tier 5"] += 1
+                        else:
+                            tier = "6"
+                            counter_tier6 += 1
+                            tier_dict[key1]["tier 6"] += 1
 
                         chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
                         var_id = '-'.join([chrom, str(int(pos) + 1), ref, alt])
@@ -730,13 +768,13 @@
     if chimera_correction:
         header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'chimeras in AC alt (all tiers)', 'chimera-corrected cvrg', 'chimera-corrected AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'chimeras in AC alt (tiers 1.1-2.4)', 'chimera-corrected cvrg (tiers 1.1-2.4)', 'chimera-corrected AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
                         'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
-                        'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
-                        'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5')
+                        'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
+                        'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5', 'AF 1.1-6')
     else:
         header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
                         'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
-                        'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
-                        'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5')
+                        'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
+                        'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5', 'AF 1.1-6')
 
     ws2.write_row(0, 0, header_line2)
     row = 0
@@ -799,7 +837,7 @@
     sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
               ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24),
               ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41),
-              ("tier 4.2", counter_tier42), ("tier 5", counter_tier5)]
+              ("tier 4.2", counter_tier42), ("tier 5", counter_tier5), ("tier 6", counter_tier6)]
 
     header = ("tier", "count")
     ws3.write_row(0, 0, header)
@@ -819,7 +857,18 @@
                                 'criteria': '=$A${}>="3"'.format(i + 2),
                                 'format': format2})
 
-    description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), ("Tier 5", "remaining variants")]
+    description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), 
+                         ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), 
+                         ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), 
+                         ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), 
+                         ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), 
+                         ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), 
+                         ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), 
+                         ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), 
+                         ("Tier 4.1", "variants at the start of the reads"),
+                         ("Tier 4.2", "variants at the end of the reads"), 
+                         ("Tier 5", "mates with contradictory information"), 
+                         ("Tier 6", "remaining variants")]
     examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
                         "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
                         "4081", "4098", "5", "10", "", ""),
@@ -874,18 +923,23 @@
                        ("", "", "ACAACATCACGTATTCAGGT", "ab2.ba1", "35", "35", "240", "258", "271",
                         "2", "3", "2", "3", "0", "1", "2", "2", "0", "0.333333333333333", "1",
                         "0.666666666666667", "0", "0", "0", "0", "0", "0", "1", "1", "6584", "6482", "", "")],
-                      [("Chr5:5-20000-13983-G-C", "4.1", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "0", "100", "255", "276", "269",
+                      [("Chr5:5-20000-13983-G-C", "4.1", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "1", "5", "255", "276", "269",
                         "5", "6", "0", "6", "0", "0", "5", "6", "0", "0", "0", "1", "0", "0", "0", "0", "5", "0", "1", "1", "5348", "5350", "", ""),
                        ("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None,
                         "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
                         "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
-                      [("Chr5:5-20000-13963-T-C", "4.2", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
+                      [("Chr5:5-20000-13983-G-C", "4.2", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "250", "270", "255", "276", "269",
+                        "5", "6", "0", "6", "0", "0", "5", "6", "0", "0", "0", "1", "0", "0", "0", "0", "5", "0", "1", "1", "5348", "5350", "", ""),
+                       ("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None,
+                        "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
+                        "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
+                      [("Chr5:5-20000-13963-T-C", "5", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
                         "110", "54", "110", "54", "0", "0", "110", "54", "0", "0", "1", "1", "0", "0", "0",
                         "0", "0", "0", "1", "1", "5348", "5350", "", ""),
                        ("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263",
                         "7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0",
                         "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
-                      [("Chr5:5-20000-13983-G-C", "5", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
+                      [("Chr5:5-20000-13983-G-C", "6", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
                         "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0",
                         "0", "1", "1", "5348", "5350", "", ""),
                        ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None,