diff mut2read.py @ 78:fdfe9a919ff7 draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author mheinzl
date Fri, 22 Jul 2022 09:19:44 +0000
parents 6ccff403db8a
children e46d5e377760
line wrap: on
line diff
--- a/mut2read.py	Mon Mar 29 09:22:57 2021 +0000
+++ b/mut2read.py	Fri Jul 22 09:19:44 2022 +0000
@@ -20,6 +20,7 @@
 import argparse
 import json
 import os
+import re
 import sys
 
 import numpy as np
@@ -68,6 +69,7 @@
     # get tags
     tag_dict = {}
     cvrg_dict = {}
+    tag_dict_ref = {}
 
     for variant in VCF(file1):
         chrom = variant.CHROM
@@ -77,54 +79,108 @@
             continue
         else:
             alt = variant.ALT[0]
+        alt = alt.upper()
+        ref = ref.upper()
+        if "N" in alt:  # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV)
+            continue
         chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
         dcs_len = []
-        if len(ref) == len(alt):
-            for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
-                if pileupcolumn.reference_pos == stop_pos:
-                    count_alt = 0
-                    count_ref = 0
-                    count_indel = 0
-                    count_n = 0
-                    count_other = 0
-                    count_lowq = 0
-                    print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
-                          "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
-                    for pileupread in pileupcolumn.pileups:
-                        if not pileupread.is_del and not pileupread.is_refskip:
-                            # query position is None if is_del or is_refskip is set.
-                            nuc = pileupread.alignment.query_sequence[pileupread.query_position]
-                            dcs_len.append(len(pileupread.alignment.query_sequence))
-                            if nuc == alt:
-                                count_alt += 1
-                                tag = pileupread.alignment.query_name
-                                if tag in tag_dict:
-                                    tag_dict[tag][chrom_stop_pos] = alt
-                                else:
-                                    tag_dict[tag] = {}
-                                    tag_dict[tag][chrom_stop_pos] = alt
-                            elif nuc == ref:
-                                count_ref += 1
-                            elif nuc == "N":
-                                count_n += 1
-                            elif nuc == "lowQ":
-                                count_lowq += 1
+        for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
+            if pileupcolumn.reference_pos == stop_pos:
+                count_alt = 0
+                count_ref = 0
+                count_indel = 0
+                count_n = 0
+                count_other = 0
+                count_lowq = 0
+                for pileupread in pileupcolumn.pileups:
+                    if not pileupread.is_refskip:
+                        if pileupread.is_del:
+                            p = pileupread.query_position_or_next
+                            e = p + len(alt) - 1
+                        else:
+                            p = pileupread.query_position
+                            e = p + len(alt)
+                        s = p
+                        split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring)
+                        if len(ref) < len(alt):
+                            if "I" in split_cigar:
+                                all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"]
+                                for ai in all_insertions:  # if multiple insertions in DCS
+                                    ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
+                                    ins_count = split_cigar[ai - 1]  # nr of insertions should match with alt allele
+                                    if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count):  # if position in read matches and length of insertion
+                                        nuc = pileupread.alignment.query_sequence[s:e]
+                                        break
+                                    else:
+                                        nuc = pileupread.alignment.query_sequence[s]
                             else:
-                                count_other += 1
-                        else:
-                            count_indel += 1
-                    dcs_median = np.median(np.array(dcs_len))
-                    cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
+                                nuc = pileupread.alignment.query_sequence[s]
+                        elif len(ref) > len(alt):
+                            ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)]
+                            if "D" in split_cigar:
+                                all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"]
+                                for di, ai in enumerate(all_deletions):  # if multiple insertions in DCS
+                                    if di > 0:  # more than 1 deletion, don't count previous deletion to position
+                                        all_deletions_mod = split_cigar[:ai - 1]
+                                        prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")]
+                                        split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx]
+                                        del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()]
+                                    else:  # first deletion in read, sum all previous (mis)matches and insertions to position
+                                        del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()]
+                                    del_count = split_cigar[ai - 1]  # nr of deletions should match with ref allele
+                                    if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count):
+                                        nuc = pileupread.alignment.query_sequence[s:e]
+                                        if nuc == "":
+                                            nuc = str(alt)
+                                        break
+                                    else:
+                                        nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
+                            elif len(ref_positions) < len(ref):  # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there
+                                nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)]
+                                if nuc.upper() == ref[:len(nuc)]:
+                                    nuc = str(ref)
+                            else:
+                                nuc = pileupread.alignment.query_sequence[s:s + len(ref)]
+                        else:  # SNV: query position is None if is_del or is_refskip is set.
+                            nuc = pileupread.alignment.query_sequence[s]
 
-                    print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
-                          (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
-                           count_indel, count_lowq, dcs_median))
-        else:
-            print("indels are currently not evaluated")
+                        nuc = nuc.upper()
+                        tag = pileupread.alignment.query_name
+                        if "_" in tag:
+                            tag = re.split('_', tag)[0]
+
+                        if nuc == alt:
+                            count_alt += 1
+                            if tag in tag_dict:
+                                tag_dict[tag][chrom_stop_pos] = alt
+                            else:
+                                tag_dict[tag] = {}
+                                tag_dict[tag][chrom_stop_pos] = alt
+                        elif nuc == ref:
+                            count_ref += 1
+                            if tag in tag_dict_ref:
+                                tag_dict_ref[tag][chrom_stop_pos] = ref
+                            else:
+                                tag_dict_ref[tag] = {}
+                                tag_dict_ref[tag][chrom_stop_pos] = ref
+                        elif nuc == "N":
+                            count_n += 1
+                        elif nuc == "lowQ":
+                            count_lowq += 1
+                        else:
+                            count_other += 1
+                        dcs_len.append(len(pileupread.alignment.query_sequence))
+
+                dcs_median = np.median(np.array(dcs_len))
+                cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median)
+                print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" %
+                      (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n,
+                       count_indel, count_lowq, dcs_median))
     bam.close()
 
     with open(json_file, "w") as f:
-        json.dump((tag_dict, cvrg_dict), f)
+        json.dump((tag_dict, cvrg_dict, tag_dict_ref), f)
 
     # create fastq from aligned reads
     with open(outfile, 'w') as out:
@@ -134,12 +190,11 @@
                 splits = line.split('\t')
                 tag = splits[0]
 
-                if tag in tag_dict:
+                if tag in tag_dict or tag in tag_dict_ref:
                     str1 = splits[4]
                     curr_seq = str1.replace("-", "")
                     str2 = splits[5]
                     curr_qual = str2.replace(" ", "")
-
                     out.write("@" + splits[0] + "." + splits[1] + "." + splits[2] + "\n")
                     out.write(curr_seq + "\n")
                     out.write("+" + "\n")