Mercurial > repos > mheinzl > variant_analyzer2
diff mut2read.py @ 78:fdfe9a919ff7 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author | mheinzl |
---|---|
date | Fri, 22 Jul 2022 09:19:44 +0000 |
parents | 6ccff403db8a |
children | e46d5e377760 |
line wrap: on
line diff
--- a/mut2read.py Mon Mar 29 09:22:57 2021 +0000 +++ b/mut2read.py Fri Jul 22 09:19:44 2022 +0000 @@ -20,6 +20,7 @@ import argparse import json import os +import re import sys import numpy as np @@ -68,6 +69,7 @@ # get tags tag_dict = {} cvrg_dict = {} + tag_dict_ref = {} for variant in VCF(file1): chrom = variant.CHROM @@ -77,54 +79,108 @@ continue else: alt = variant.ALT[0] + alt = alt.upper() + ref = ref.upper() + if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) + continue chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt dcs_len = [] - if len(ref) == len(alt): - for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): - if pileupcolumn.reference_pos == stop_pos: - count_alt = 0 - count_ref = 0 - count_indel = 0 - count_n = 0 - count_other = 0 - count_lowq = 0 - print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), - "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) - for pileupread in pileupcolumn.pileups: - if not pileupread.is_del and not pileupread.is_refskip: - # query position is None if is_del or is_refskip is set. - nuc = pileupread.alignment.query_sequence[pileupread.query_position] - dcs_len.append(len(pileupread.alignment.query_sequence)) - if nuc == alt: - count_alt += 1 - tag = pileupread.alignment.query_name - if tag in tag_dict: - tag_dict[tag][chrom_stop_pos] = alt - else: - tag_dict[tag] = {} - tag_dict[tag][chrom_stop_pos] = alt - elif nuc == ref: - count_ref += 1 - elif nuc == "N": - count_n += 1 - elif nuc == "lowQ": - count_lowq += 1 + for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): + if pileupcolumn.reference_pos == stop_pos: + count_alt = 0 + count_ref = 0 + count_indel = 0 + count_n = 0 + count_other = 0 + count_lowq = 0 + for pileupread in pileupcolumn.pileups: + if not pileupread.is_refskip: + if pileupread.is_del: + p = pileupread.query_position_or_next + e = p + len(alt) - 1 + else: + p = pileupread.query_position + e = p + len(alt) + s = p + split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) + if len(ref) < len(alt): + if "I" in split_cigar: + all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] + for ai in all_insertions: # if multiple insertions in DCS + ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele + if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count): # if position in read matches and length of insertion + nuc = pileupread.alignment.query_sequence[s:e] + break + else: + nuc = pileupread.alignment.query_sequence[s] else: - count_other += 1 - else: - count_indel += 1 - dcs_median = np.median(np.array(dcs_len)) - cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) + nuc = pileupread.alignment.query_sequence[s] + elif len(ref) > len(alt): + ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] + if "D" in split_cigar: + all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] + for di, ai in enumerate(all_deletions): # if multiple insertions in DCS + if di > 0: # more than 1 deletion, don't count previous deletion to position + all_deletions_mod = split_cigar[:ai - 1] + prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] + split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] + del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] + else: # first deletion in read, sum all previous (mis)matches and insertions to position + del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele + if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count): + nuc = pileupread.alignment.query_sequence[s:e] + if nuc == "": + nuc = str(alt) + break + else: + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there + nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] + if nuc.upper() == ref[:len(nuc)]: + nuc = str(ref) + else: + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + else: # SNV: query position is None if is_del or is_refskip is set. + nuc = pileupread.alignment.query_sequence[s] - print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % - (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, - count_indel, count_lowq, dcs_median)) - else: - print("indels are currently not evaluated") + nuc = nuc.upper() + tag = pileupread.alignment.query_name + if "_" in tag: + tag = re.split('_', tag)[0] + + if nuc == alt: + count_alt += 1 + if tag in tag_dict: + tag_dict[tag][chrom_stop_pos] = alt + else: + tag_dict[tag] = {} + tag_dict[tag][chrom_stop_pos] = alt + elif nuc == ref: + count_ref += 1 + if tag in tag_dict_ref: + tag_dict_ref[tag][chrom_stop_pos] = ref + else: + tag_dict_ref[tag] = {} + tag_dict_ref[tag][chrom_stop_pos] = ref + elif nuc == "N": + count_n += 1 + elif nuc == "lowQ": + count_lowq += 1 + else: + count_other += 1 + dcs_len.append(len(pileupread.alignment.query_sequence)) + + dcs_median = np.median(np.array(dcs_len)) + cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) + print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % + (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, + count_indel, count_lowq, dcs_median)) bam.close() with open(json_file, "w") as f: - json.dump((tag_dict, cvrg_dict), f) + json.dump((tag_dict, cvrg_dict, tag_dict_ref), f) # create fastq from aligned reads with open(outfile, 'w') as out: @@ -134,12 +190,11 @@ splits = line.split('\t') tag = splits[0] - if tag in tag_dict: + if tag in tag_dict or tag in tag_dict_ref: str1 = splits[4] curr_seq = str1.replace("-", "") str2 = splits[5] curr_qual = str2.replace(" ", "") - out.write("@" + splits[0] + "." + splits[1] + "." + splits[2] + "\n") out.write(curr_seq + "\n") out.write("+" + "\n")