# HG changeset patch # User mheinzl # Date 1658481584 0 # Node ID fdfe9a919ff77fabf6f9001e95ff230a0c04025a # Parent 1797e461d674925776920c3ff12daf51e5eb7f7f planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty diff -r 1797e461d674 -r fdfe9a919ff7 mut2read.py --- a/mut2read.py Mon Mar 29 09:22:57 2021 +0000 +++ b/mut2read.py Fri Jul 22 09:19:44 2022 +0000 @@ -20,6 +20,7 @@ import argparse import json import os +import re import sys import numpy as np @@ -68,6 +69,7 @@ # get tags tag_dict = {} cvrg_dict = {} + tag_dict_ref = {} for variant in VCF(file1): chrom = variant.CHROM @@ -77,54 +79,108 @@ continue else: alt = variant.ALT[0] + alt = alt.upper() + ref = ref.upper() + if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) + continue chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt dcs_len = [] - if len(ref) == len(alt): - for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): - if pileupcolumn.reference_pos == stop_pos: - count_alt = 0 - count_ref = 0 - count_indel = 0 - count_n = 0 - count_other = 0 - count_lowq = 0 - print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), - "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) - for pileupread in pileupcolumn.pileups: - if not pileupread.is_del and not pileupread.is_refskip: - # query position is None if is_del or is_refskip is set. - nuc = pileupread.alignment.query_sequence[pileupread.query_position] - dcs_len.append(len(pileupread.alignment.query_sequence)) - if nuc == alt: - count_alt += 1 - tag = pileupread.alignment.query_name - if tag in tag_dict: - tag_dict[tag][chrom_stop_pos] = alt - else: - tag_dict[tag] = {} - tag_dict[tag][chrom_stop_pos] = alt - elif nuc == ref: - count_ref += 1 - elif nuc == "N": - count_n += 1 - elif nuc == "lowQ": - count_lowq += 1 + for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): + if pileupcolumn.reference_pos == stop_pos: + count_alt = 0 + count_ref = 0 + count_indel = 0 + count_n = 0 + count_other = 0 + count_lowq = 0 + for pileupread in pileupcolumn.pileups: + if not pileupread.is_refskip: + if pileupread.is_del: + p = pileupread.query_position_or_next + e = p + len(alt) - 1 + else: + p = pileupread.query_position + e = p + len(alt) + s = p + split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) + if len(ref) < len(alt): + if "I" in split_cigar: + all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] + for ai in all_insertions: # if multiple insertions in DCS + ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele + if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count): # if position in read matches and length of insertion + nuc = pileupread.alignment.query_sequence[s:e] + break + else: + nuc = pileupread.alignment.query_sequence[s] else: - count_other += 1 - else: - count_indel += 1 - dcs_median = np.median(np.array(dcs_len)) - cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) + nuc = pileupread.alignment.query_sequence[s] + elif len(ref) > len(alt): + ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] + if "D" in 
split_cigar: + all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] + for di, ai in enumerate(all_deletions): # if multiple insertions in DCS + if di > 0: # more than 1 deletion, don't count previous deletion to position + all_deletions_mod = split_cigar[:ai - 1] + prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] + split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] + del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] + else: # first deletion in read, sum all previous (mis)matches and insertions to position + del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele + if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count): + nuc = pileupread.alignment.query_sequence[s:e] + if nuc == "": + nuc = str(alt) + break + else: + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there + nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] + if nuc.upper() == ref[:len(nuc)]: + nuc = str(ref) + else: + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + else: # SNV: query position is None if is_del or is_refskip is set. + nuc = pileupread.alignment.query_sequence[s] - print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % - (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, - count_indel, count_lowq, dcs_median)) - else: - print("indels are currently not evaluated") + nuc = nuc.upper() + tag = pileupread.alignment.query_name + if "_" in tag: + tag = re.split('_', tag)[0] + + if nuc == alt: + count_alt += 1 + if tag in tag_dict: + tag_dict[tag][chrom_stop_pos] = alt + else: + tag_dict[tag] = {} + tag_dict[tag][chrom_stop_pos] = alt + elif nuc == ref: + count_ref += 1 + if tag in tag_dict_ref: + tag_dict_ref[tag][chrom_stop_pos] = ref + else: + tag_dict_ref[tag] = {} + tag_dict_ref[tag][chrom_stop_pos] = ref + elif nuc == "N": + count_n += 1 + elif nuc == "lowQ": + count_lowq += 1 + else: + count_other += 1 + dcs_len.append(len(pileupread.alignment.query_sequence)) + + dcs_median = np.median(np.array(dcs_len)) + cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) + print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % + (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, + count_indel, count_lowq, dcs_median)) bam.close() with open(json_file, "w") as f: - json.dump((tag_dict, cvrg_dict), f) + json.dump((tag_dict, cvrg_dict, tag_dict_ref), f) # create fastq from aligned reads with open(outfile, 'w') as out: @@ -134,12 +190,11 @@ splits = line.split('\t') tag = splits[0] - if tag in tag_dict: + if tag in tag_dict or tag in tag_dict_ref: str1 = splits[4] curr_seq = str1.replace("-", "") str2 = splits[5] curr_qual = str2.replace(" ", "") - out.write("@" + splits[0] + "." + splits[1] + "." 
+ splits[2] + "\n") out.write(curr_seq + "\n") out.write("+" + "\n") diff -r 1797e461d674 -r fdfe9a919ff7 mut2read.xml --- a/mut2read.xml Mon Mar 29 09:22:57 2021 +0000 +++ b/mut2read.xml Fri Jul 22 09:19:44 2022 +0000 @@ -1,5 +1,5 @@ - + Extracts all tags that carry a mutation in the duplex consensus sequence (DCS) va_macros.xml @@ -17,7 +17,7 @@ ]]> - + @@ -27,11 +27,11 @@ - + - - + + `_ or `LoFreq `_ variant caller. @@ -57,7 +57,7 @@ **Output** -The output is a json file containing dictonaries of the tags of reads containing mutations +The output is a json file containing dictionaries of the tags of reads containing mutations in the DCS and a fastq file of all reads of these tags. ]]> diff -r 1797e461d674 -r fdfe9a919ff7 mut2sscs.py --- a/mut2sscs.py Mon Mar 29 09:22:57 2021 +0000 +++ b/mut2sscs.py Fri Jul 22 09:19:44 2022 +0000 @@ -23,6 +23,7 @@ import argparse import json import os +import re import sys import numpy as np @@ -71,60 +72,111 @@ continue else: alt = variant.ALT[0] + alt = alt.upper() + ref = ref.upper() + if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) + continue chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt - if len(ref) == len(alt): - for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): - if pileupcolumn.reference_pos == stop_pos: - count_alt = 0 - count_ref = 0 - count_indel = 0 - print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), - "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) - for pileupread in pileupcolumn.pileups: - if not pileupread.is_del and not pileupread.is_refskip: - tag = pileupread.alignment.query_name - abba = tag[-2:] - # query position is None if is_del or is_refskip is set. 
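# --- Illustrative sketch, not part of the patch: the indel-aware branches added in
# --- this changeset (mut2read.py, mut2sscs.py, read2mut.py) all rely on the same
# --- idiom: re.split with a capturing group turns the CIGAR string into alternating
# --- length/operation tokens, and summing the numeric tokens that precede an "I" or
# --- "D" locates that indel within the read. The helper below is hypothetical and
# --- only demonstrates the idea.
import re

def indel_offsets(cigar, op="I"):
    """Return (sum_of_preceding_lengths, op_length) for every `op` in a CIGAR string."""
    tokens = re.split(r"(\d+)", cigar)  # "5M2I93M" -> ['', '5', 'M', '2', 'I', '93', 'M']
    hits = []
    for idx, tok in enumerate(tokens):
        if tok == op:
            length = int(tokens[idx - 1])
            preceding = sum(int(t) for t in tokens[:idx - 1] if t.isdigit())
            hits.append((preceding, length))
    return hits

print(indel_offsets("5M2I93M"))                # [(5, 2)]
print(indel_offsets("10M1D20M3D70M", op="D"))  # [(10, 1), (31, 3)]
# The naive sum still counts earlier deletions, which consume no read bases; this is
# why the patch strips previous "D" runs from the sum (split_cigar_no_prev) before
# comparing it against the pileup query position.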
- if pileupread.alignment.query_sequence[pileupread.query_position] == alt: - count_alt += 1 - if chrom_stop_pos in mut_pos_dict: - if abba in mut_pos_dict[chrom_stop_pos]: - mut_pos_dict[chrom_stop_pos][abba] += 1 + for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): + if pileupcolumn.reference_pos == stop_pos: + count_alt = 0 + count_ref = 0 + count_indel = 0 + for pileupread in pileupcolumn.pileups: + if not pileupread.is_refskip: + tag = pileupread.alignment.query_name + abba = tag[-2:] + if pileupread.is_del: + p = pileupread.query_position_or_next + e = p + len(alt) - 1 + else: + p = pileupread.query_position + e = p + len(alt) + tag = pileupread.alignment.query_name + if "_" in tag: + tag = re.split('_', tag)[0] + s = p + split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) + if len(ref) < len(alt): # insertion + if "I" in split_cigar: + all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] + for ai in all_insertions: # if multiple insertions in DCS + ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele + if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count): + nuc = pileupread.alignment.query_sequence[s:e] + break else: - mut_pos_dict[chrom_stop_pos][abba] = 1 - else: - mut_pos_dict[chrom_stop_pos] = {} - mut_pos_dict[chrom_stop_pos][abba] = 1 - if chrom_stop_pos not in ref_pos_dict: - ref_pos_dict[chrom_stop_pos] = {} - ref_pos_dict[chrom_stop_pos][abba] = 0 + nuc = pileupread.alignment.query_sequence[s] + else: + nuc = pileupread.alignment.query_sequence[s] + elif len(ref) > len(alt): # deletion + ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] + if "D" in split_cigar: + all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] + for di, ai in enumerate(all_deletions): # if multiple insertions in DCS + if di > 0: # more than 1 deletion, don't count previous deletion to position + all_deletions_mod = split_cigar[:ai - 1] + prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] + split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] + del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] + else: # first deletion in read, sum all previous (mis)matches and insertions to position + del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele + if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count): + nuc = pileupread.alignment.query_sequence[s:e] + if nuc == "": + nuc = str(alt) + break + else: + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there + nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] + if nuc.upper() == ref[:len(nuc)]: + nuc = str(ref) + else: + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + else: # SNV: query position is None if is_del or is_refskip is set. 
+ nuc = pileupread.alignment.query_sequence[s] - elif pileupread.alignment.query_sequence[pileupread.query_position] == ref: - count_ref += 1 - if chrom_stop_pos in ref_pos_dict: - if abba in ref_pos_dict[chrom_stop_pos]: - ref_pos_dict[chrom_stop_pos][abba] += 1 - else: - ref_pos_dict[chrom_stop_pos][abba] = 1 + nuc = nuc.upper() + + if nuc == alt: + count_alt += 1 + if chrom_stop_pos in mut_pos_dict: + if abba in mut_pos_dict[chrom_stop_pos]: + mut_pos_dict[chrom_stop_pos][abba] += 1 else: - ref_pos_dict[chrom_stop_pos] = {} + mut_pos_dict[chrom_stop_pos][abba] = 1 + else: + mut_pos_dict[chrom_stop_pos] = {} + mut_pos_dict[chrom_stop_pos][abba] = 1 + if chrom_stop_pos not in ref_pos_dict: + ref_pos_dict[chrom_stop_pos] = {} + ref_pos_dict[chrom_stop_pos][abba] = 0 + elif nuc == ref: + count_ref += 1 + if chrom_stop_pos in ref_pos_dict: + if abba in ref_pos_dict[chrom_stop_pos]: + ref_pos_dict[chrom_stop_pos][abba] += 1 + else: ref_pos_dict[chrom_stop_pos][abba] = 1 else: - count_indel += 1 - - print("coverage at pos %s = %s, ref = %s, alt = %s, indel = %s,\n" % - (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) + ref_pos_dict[chrom_stop_pos] = {} + ref_pos_dict[chrom_stop_pos][abba] = 1 + else: + count_indel += 1 + print("coverage at pos %s = %s, ref = %s, alt = %s, other = %s,\n" % + (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_indel)) - # if mutation is in DCS file but not in SSCS, then set counts to NA - if chrom_stop_pos not in mut_pos_dict.keys(): - mut_pos_dict[chrom_stop_pos] = {} - mut_pos_dict[chrom_stop_pos]["ab"] = 0 - mut_pos_dict[chrom_stop_pos]["ba"] = 0 - ref_pos_dict[chrom_stop_pos] = {} - ref_pos_dict[chrom_stop_pos]["ab"] = 0 - ref_pos_dict[chrom_stop_pos]["ba"] = 0 - else: - print("indels are currently not evaluated") + # if mutation is in DCS file but not in SSCS, then set counts to NA + if chrom_stop_pos not in mut_pos_dict.keys(): + mut_pos_dict[chrom_stop_pos] = {} + mut_pos_dict[chrom_stop_pos]["ab"] = 0 + mut_pos_dict[chrom_stop_pos]["ba"] = 0 + ref_pos_dict[chrom_stop_pos] = {} + ref_pos_dict[chrom_stop_pos]["ab"] = 0 + ref_pos_dict[chrom_stop_pos]["ba"] = 0 bam.close() # save counts diff -r 1797e461d674 -r fdfe9a919ff7 mut2sscs.xml --- a/mut2sscs.xml Mon Mar 29 09:22:57 2021 +0000 +++ b/mut2sscs.xml Fri Jul 22 09:19:44 2022 +0000 @@ -1,6 +1,6 @@ - - Extracts all tags from the single stranded consensus sequence (SSCS) bam file that carry a mutation at the same position a mutation is called in the duplex consensus sequence (DCS) and calculates their frequencies + + Extracts all tags from the single-stranded consensus sequence (SSCS) bam file that carries a mutation at the same position a mutation is called in the duplex consensus sequence (DCS) and calculates their frequencies va_macros.xml @@ -15,7 +15,7 @@ ]]> - + @@ -25,29 +25,29 @@ - + `_ or `LoFreq `_ variant caller. -**Dataset 2:** BAM file of aligned single stranded consensus sequence (SSCS) +**Dataset 2:** BAM file of the aligned single stranded consensus sequence (SSCS) reads. This file can be obtained by the tool `Map with BWA-MEM `_. **Output** -The output is a json file containing dictonaries with stats of tags that carry a mutation in the SSCS -at the same position a mutation is called in the DCS. +The output is a json file containing dictionaries with stats of tags that carry a mutation in the SSCS +at the same position, a mutation is called in the DCS. 
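For orientation, the JSON written by this tool is a two-element list holding the
dictionaries ``mut_pos_dict`` and ``ref_pos_dict``: counts of SSCS reads supporting the
alternate or the reference allele, keyed by ``chrom#pos#ref#alt`` and split by strand
orientation (``ab``/``ba``). A minimal, illustrative way to read it downstream (file
name and values are made up)::

    import json
    with open("sscs_counts.json") as f:
        mut_pos_dict, ref_pos_dict = json.load(f)
    # e.g. mut_pos_dict["chr12#25398284#C#A"] -> {"ab": 3, "ba": 2}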
]]> diff -r 1797e461d674 -r fdfe9a919ff7 read2mut.py --- a/read2mut.py Mon Mar 29 09:22:57 2021 +0000 +++ b/read2mut.py Fri Jul 22 09:19:44 2022 +0000 @@ -87,6 +87,7 @@ outfile2 = args.outputFile2 outfile3 = args.outputFile3 outputFile_csv = args.outputFile_csv + thresh = args.thresh phred_score = args.phred trim = args.trim @@ -111,7 +112,7 @@ # load dicts with open(json_file, "r") as f: - (tag_dict, cvrg_dict) = json.load(f) + (tag_dict, cvrg_dict, tag_dict_ref) = json.load(f) with open(sscs_json, "r") as f: (mut_pos_dict, ref_pos_dict) = json.load(f) @@ -138,106 +139,207 @@ continue else: alt = variant.ALT[0] - chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt + + alt = alt.upper() + ref = ref.upper() + if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) + continue - if len(ref) == len(alt): - mut_array.append([chrom, stop_pos, ref, alt]) - i += 1 - mut_dict[chrom_stop_pos] = {} - mut_read_pos_dict[chrom_stop_pos] = {} - reads_dict[chrom_stop_pos] = {} - mut_read_cigar_dict[chrom_stop_pos] = {} - real_start_end[chrom_stop_pos] = {} + chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt + mut_array.append([chrom, stop_pos, ref, alt]) + i += 1 + mut_dict[chrom_stop_pos] = {} + mut_read_pos_dict[chrom_stop_pos] = {} + reads_dict[chrom_stop_pos] = {} + mut_read_cigar_dict[chrom_stop_pos] = {} + real_start_end[chrom_stop_pos] = {} + for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): + if pileupcolumn.reference_pos == stop_pos: + count_alt = 0 + count_ref = 0 + count_indel = 0 + count_n = 0 + count_other = 0 + count_lowq = 0 + n = 0 + for pileupread in pileupcolumn.pileups: + n += 1 + if not pileupread.is_refskip: + if pileupread.is_del: + p = pileupread.query_position_or_next + e = p + len(alt) - 1 + else: + p = pileupread.query_position + e = p + len(alt) + s = p + tag = pileupread.alignment.query_name + split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) - for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): - if pileupcolumn.reference_pos == stop_pos: - count_alt = 0 - count_ref = 0 - count_indel = 0 - count_n = 0 - count_other = 0 - count_lowq = 0 - n = 0 - for pileupread in pileupcolumn.pileups: - n += 1 - if not pileupread.is_del and not pileupread.is_refskip: - tag = pileupread.alignment.query_name - nuc = pileupread.alignment.query_sequence[pileupread.query_position] - phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 - # if read is softclipped, store real position in reference - if "S" in pileupread.alignment.cigarstring: - # spftclipped at start - if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): - start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) - end = pileupread.alignment.reference_end - # softclipped at end - elif re.search(r"S$", pileupread.alignment.cigarstring): - end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2]) - start = pileupread.alignment.reference_start - else: + if len(ref) < len(alt): + if "I" in split_cigar: + all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] + for ai in all_insertions: # if multiple insertions in DCS + ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele + if 
"I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) >= len(alt) - 1: # if pe read matches exatcly to insertion + nuc = pileupread.alignment.query_sequence[s:e] + phred = ord(pileupread.alignment.qual[s]) - 33 + break + elif "I" in split_cigar and sum(ins_index) == p + 1 and int(ins_count) < len(alt) - 1: # if pe read has shorter insertion -- not alt + nuc = pileupread.alignment.query_sequence[s:s+int(ins_count)+1] + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: # insertion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + elif "D" in split_cigar: # if deletion in pe read, don't count + all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] + for di, ai in enumerate(all_deletions): # if multiple insertions in DCS + if di > 0: # more than 1 deletion, don't count previous deletion to position + all_deletions_mod = split_cigar[:ai - 1] + prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] + split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] + del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] + else: # first deletion in read, sum all previous (mis)matches and insertions to position + del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + if "D" in split_cigar and sum(del_index) == p + 1: # if deletion at that position + nuc = "D" + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + else: # insertion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + + elif len(ref) > len(alt): + ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] + if "D" in split_cigar: + all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] + for di, ai in enumerate(all_deletions): # if multiple insertions in DCS + if di > 0: # more than 1 deletion, don't count previous deletion to position + all_deletions_mod = split_cigar[:ai - 1] + prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] + split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] + del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] + else: # first deletion in read, sum all previous (mis)matches and insertions to position + del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + del_count = split_cigar[ai - 1] # deletion on that position but does not necesserily match in the length + if "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) >= len(ref) - 1: # if pe read matches exatcly to deletion + nuc = pileupread.alignment.query_sequence[s:e] + phred = ord(pileupread.alignment.qual[s]) - 33 + if nuc == "": + nuc = str(alt) + break + elif "D" in split_cigar and sum(del_index) == p + 1 and int(del_count) < len(ref) - 1: # if pe read has shorter deletion --> not alt + nuc = str(ref)[:int(del_count)+1] + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: # deletion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + phred = ord(pileupread.alignment.qual[s]) - 33 + elif "I" in split_cigar: # if pe read has insertion --> not count + all_insertions = [inser_i for 
inser_i, ins in enumerate(split_cigar) if ins == "I"] + for ai in all_insertions: # if multiple insertions in DCS + ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] + ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele + if "I" in split_cigar and sum(ins_index) == p + 1: + nuc = "I" + phred = ord(pileupread.alignment.qual[s]) - 33 + break + else: + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there + nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] + phred = ord(pileupread.alignment.qual[s]) - 33 + if nuc.upper() == ref[:len(nuc)]: + nuc = str(ref) + else: # deletion in pe reads but not at the desired position + nuc = pileupread.alignment.query_sequence[s:s + len(ref)] + phred = ord(pileupread.alignment.qual[s]) - 33 + else: # SNV: query position is None if is_del or is_refskip is set. + nuc = pileupread.alignment.query_sequence[s] + phred = ord(pileupread.alignment.qual[s]) - 33 + + nuc = nuc.upper() + + # if read is softclipped, store real position in reference + if "S" in pileupread.alignment.cigarstring: + # spftclipped at start + if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): + start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) end = pileupread.alignment.reference_end + # softclipped at end + elif re.search(r"S$", pileupread.alignment.cigarstring): + end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2]) start = pileupread.alignment.reference_start - - if phred < phred_score: - nuc = "lowQ" - if tag not in mut_dict[chrom_stop_pos]: - mut_dict[chrom_stop_pos][tag] = {} - if nuc in mut_dict[chrom_stop_pos][tag]: - mut_dict[chrom_stop_pos][tag][nuc] += 1 - else: - mut_dict[chrom_stop_pos][tag][nuc] = 1 - if tag not in mut_read_pos_dict[chrom_stop_pos]: - mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] - reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] - mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] - real_start_end[chrom_stop_pos][tag] = [(start, end)] + else: + end = pileupread.alignment.reference_end + start = pileupread.alignment.reference_start + if phred < phred_score: + nuc = "lowQ" + if tag not in mut_dict[chrom_stop_pos]: + mut_dict[chrom_stop_pos][tag] = {} + if nuc in mut_dict[chrom_stop_pos][tag]: + mut_dict[chrom_stop_pos][tag][nuc] += 1 + else: + mut_dict[chrom_stop_pos][tag][nuc] = 1 + if tag not in mut_read_pos_dict[chrom_stop_pos]: + mut_read_pos_dict[chrom_stop_pos][tag] = [s + 1] + reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] + mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] + real_start_end[chrom_stop_pos][tag] = [(start, end)] + else: + mut_read_pos_dict[chrom_stop_pos][tag].append(s + 1) + reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) + mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) + real_start_end[chrom_stop_pos][tag].append((start, end)) + if nuc == alt: + count_alt += 1 + if tag not in mut_read_dict: + mut_read_dict[tag] = {} + mut_read_dict[tag][chrom_stop_pos] = (alt, ref) else: - mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) - 
reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) - mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) - real_start_end[chrom_stop_pos][tag].append((start, end)) - if nuc == alt: - count_alt += 1 - if tag not in mut_read_dict: - mut_read_dict[tag] = {} - mut_read_dict[tag][chrom_stop_pos] = (alt, ref) - else: - mut_read_dict[tag][chrom_stop_pos] = (alt, ref) - elif nuc == ref: - count_ref += 1 - elif nuc == "N": - count_n += 1 - elif nuc == "lowQ": - count_lowq += 1 - else: - count_other += 1 + mut_read_dict[tag][chrom_stop_pos] = (alt, ref) + elif nuc == ref: + count_ref += 1 + elif nuc == "N": + count_n += 1 + elif nuc == "lowQ": + count_lowq += 1 else: - count_indel += 1 + count_other += 1 + else: + count_indel += 1 mut_array = np.array(mut_array) for read in bam.fetch(until_eof=True): if read.is_unmapped: pure_tag = read.query_name[:-5] nuc = "na" - for key in tag_dict[pure_tag].keys(): - if key not in mut_dict: - mut_dict[key] = {} - if read.query_name not in mut_dict[key]: - mut_dict[key][read.query_name] = {} - if nuc in mut_dict[key][read.query_name]: - mut_dict[key][read.query_name][nuc] += 1 - else: - mut_dict[key][read.query_name][nuc] = 1 + if pure_tag in tag_dict.keys(): # stored all ref and alt reads --> get only alt reads + for key in tag_dict[pure_tag].keys(): + if key not in mut_dict: + mut_dict[key] = {} + if read.query_name not in mut_dict[key]: + mut_dict[key][read.query_name] = {} + if nuc in mut_dict[key][read.query_name]: + mut_dict[key][read.query_name][nuc] += 1 + else: + mut_dict[key][read.query_name][nuc] = 1 bam.close() - # create pure_tags_dict pure_tags_dict = {} + pure_tags_dict_ref = {} for key1, value1 in sorted(mut_dict.items()): i = np.where(np.array(['#'.join(str(i) for i in z) for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] ref = mut_array[i, 2] alt = mut_array[i, 3] pure_tags_dict[key1] = {} + pure_tags_dict_ref[key1] = {} for key2, value2 in sorted(value1.items()): for key3, value3 in value2.items(): pure_tag = key2[:-5] @@ -247,6 +349,12 @@ else: pure_tags_dict[key1][pure_tag] = 1 + if key3 == ref: + if pure_tag in pure_tags_dict_ref[key1]: + pure_tags_dict_ref[key1][pure_tag] += 1 + else: + pure_tags_dict_ref[key1][pure_tag] = 1 + # create pure_tags_dict_short with thresh if thresh > 0: pure_tags_dict_short = {} @@ -279,7 +387,7 @@ format23 = workbook3.add_format({'bg_color': '#FFC7CE'}) # red format33 = workbook3.add_format({'bg_color': '#FACC2E'}) # yellow - header_line = ('variant ID', 'tier', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab', + header_line = ('variant ID', 'tier', 'allele', 'tag', 'mate', 'read pos.ab', 'read pos.ba', 'read median length.ab', 'read median length.ba', 'DCS median length', 'FS.ab', 'FS.ba', 'FSqc.ab', 'FSqc.ba', 'ref.ab', 'ref.ba', 'alt.ab', 'alt.ba', 'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. 
alt.ba', @@ -288,6 +396,7 @@ 'in phase', 'chimeric tag') ws1.write_row(0, 0, header_line) csv_writer.writerow(header_line) + counter_tier11 = 0 counter_tier12 = 0 counter_tier21 = 0 @@ -308,12 +417,14 @@ row = 1 tier_dict = {} + tier_dict_ref = {} chimera_dict = {} for key1, value1 in sorted(mut_dict.items()): counts_mut = 0 chimeric_tag_list = [] chimeric_tag = {} - if key1 in pure_tags_dict_short.keys(): + if (key1 in pure_tags_dict_short.keys()) or (key1 in pure_tags_dict_ref.keys()): # ref or alt + change_tier_after_print = [] i = np.where(np.array(['#'.join(str(i) for i in z) for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] @@ -323,11 +434,13 @@ whole_array = list(pure_tags_dict_short[key1].keys()) tier_dict[key1] = {} + tier_dict_ref[key1] = {} values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 2.5", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0), ("tier 6", 0), ("tier 7", 0)] for k, v in values_tier_dict: tier_dict[key1][k] = v + tier_dict_ref[key1][k] = v used_keys = [] if 'ab' in mut_pos_dict[key1].keys(): @@ -349,7 +462,16 @@ for key2, value2 in sorted(value1.items()): add_mut14 = "" add_mut23 = "" - if (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key2[:-5] not in used_keys) and (key1 in tag_dict[key2[:-5]].keys()): + if key2[:-5] not in tag_dict.keys() and key2[:-5] not in tag_dict_ref.keys(): # skip reads that have not alt or ref + continue + + if ((key2[:-5] in tag_dict.keys() and key2[:-5] in pure_tags_dict_short[key1].keys() and key1 in tag_dict[key2[:-5]].keys()) or (key2[:-5] in tag_dict_ref.keys() and key2[:-5] in pure_tags_dict_ref[key1].keys() and key1 in tag_dict_ref[key2[:-5]].keys())) and (key2[:-5] not in used_keys): + + if key2[:-5] in pure_tags_dict_short[key1].keys(): + variant_type = "alt" + elif key2[:-5] in pure_tags_dict_ref[key1].keys(): + variant_type = "ref" + if key2[:-5] + '.ab.1' in mut_dict[key1].keys(): total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values()) if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): @@ -550,36 +672,59 @@ used_keys.append(key2[:-5]) counts_mut += 1 - if (alt1f + alt2f + alt3f + alt4f) > 0.5: + if (variant_type == "alt" and ((alt1f + alt2f + alt3f + alt4f) > 0.5)) or (variant_type == "ref" and ((ref1f + ref2f + ref3f + ref4f) > 0.5)): + if variant_type == "alt": + tier1ff, tier2ff, tier3ff, tier4ff = alt1f, alt2f, alt3f, alt4f + tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = alt1f, alt2f, alt3f, alt4f + elif variant_type == "ref": + tier1ff, tier2ff, tier3ff, tier4ff = ref1f, ref2f, ref3f, ref4f + tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = ref1f, ref2f, ref3f, ref4f + total1new_trim, total2new_trim, total3new_trim, total4new_trim = total1new, total2new, total3new, total4new if total1new == 0: ref1f = alt1f = None alt1ff = -1 alt1ff_trim = -1 + tier1ff = -1 + tier1ff_trim = -1 else: alt1ff = alt1f alt1ff_trim = alt1f + tier1ff = tier1ff + tier1ff_trim = tier1ff_trim if total2new == 0: ref2f = alt2f = None alt2ff = -1 alt2ff_trim = -1 + tier2ff = -1 + tier2ff_trim = -1 else: alt2ff = alt2f alt2ff_trim = alt2f + tier2ff = tier2ff + tier2ff_trim = tier2ff_trim if total3new == 0: ref3f = alt3f = None alt3ff = -1 alt3ff_trim = -1 + tier3ff = -1 + tier3ff_trim = -1 else: alt3ff = alt3f alt3ff_trim = alt3f + tier3ff = tier3ff + tier3ff_trim = tier3ff_trim if total4new == 0: ref4f = 
alt4f = None alt4ff = -1 alt4ff_trim = -1 + tier4ff = -1 + tier4ff_trim = -1 else: alt4ff = alt4f alt4ff_trim = alt4f + tier4ff = tier4ff + tier4ff_trim = tier4ff_trim beg1 = beg4 = beg2 = beg3 = 0 @@ -605,7 +750,7 @@ # mate 1 - SSCS ab softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] safe_div_result = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) - if (safe_div_result == None): + if (safe_div_result is None): ratio1 = False else: ratio1 = safe_div_result >= threshold_reads @@ -629,7 +774,7 @@ # mate 1 - SSCS ba softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] safe_div_result = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) - if (safe_div_result == None): + if (safe_div_result is None): ratio4 = False else: ratio4 = safe_div_result >= threshold_reads @@ -653,7 +798,7 @@ # mate 2 - SSCS ab softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] safe_div_result = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) - if (safe_div_result == None): + if (safe_div_result is None): ratio2 = False else: ratio2 = safe_div_result >= threshold_reads @@ -677,7 +822,7 @@ # mate 2 - SSCS ba softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] safe_div_result = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) - if (safe_div_result == None): + if (safe_div_result is None): ratio3 = False else: ratio3 = safe_div_result >= threshold_reads @@ -698,21 +843,21 @@ ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads - if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant - all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | - (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & - all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + if ((all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) & # contradictory variant + all(float(ij) == 0. for ij in [tier2ff, tier3ff])) | + (all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]) & + all(float(ij) == 0. for ij in [tier1ff, tier4ff]))): + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = True # softclipping tiers # information of both mates available --> all reads for both mates and SSCS are softclipped elif (ratio1 & ratio4 & ratio2 & ratio3 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + all(float(ij) > 0. 
for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = True softclipped_mutation_oneOfTwoMates = False @@ -720,16 +865,16 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = False softclipped_mutation_oneMateOneSSCS = False - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False # information of both mates available --> only one mate softclipped elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + all(float(ij) > 0. for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue @@ -763,10 +908,10 @@ read_len_median3 = read_len_median3 - n_spacer_barcode else: softclipped_mutation_oneOfTwoMates = True - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False softclipped_mutation_allMates = False @@ -780,6 +925,7 @@ total1new = 0 alt1ff = 0 alt1f = 0 + tier1ff = 0 trimmed = True if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): @@ -787,6 +933,7 @@ total4new = 0 alt4ff = 0 alt4f = 0 + tier4ff = 0 trimmed = True if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): @@ -794,6 +941,7 @@ total2new = 0 alt2ff = 0 alt2f = 0 + tier2ff = 0 trimmed = True if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): @@ -801,6 +949,7 @@ total3new = 0 alt3ff = 0 alt3f = 0 + tier3ff = 0 trimmed = True details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) @@ -808,7 +957,7 @@ # information of both mates available --> only one mate softclipped elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + all(float(ij) > 0. 
for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False @@ -816,18 +965,18 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = False softclipped_mutation_oneMateOneSSCS = False - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False # information of one mate available --> all reads of one mate are softclipped elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & - all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | + all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff])) | (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & - all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available + all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) > 0. for ij in [tier2ff, tier3ff]))): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False @@ -835,17 +984,17 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = True softclipped_mutation_oneMateOneSSCS = False - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False # information of one mate available --> only one SSCS is softclipped elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & - (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | + (all(float(ij) < 0. for ij in [tier2ff, tier3ff]) & all(float(ij) > 0. for ij in [tier1ff, tier4ff]))) | (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & - (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available + (all(float(ij) < 0. for ij in [tier1ff, tier4ff]) & all(float(ij) < 0. 
for ij in [tier2ff, tier3ff])))): # all mates available # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False @@ -853,10 +1002,10 @@ softclipped_mutation_oneOfTwoSSCS_diffMates = False softclipped_mutation_oneMate = False softclipped_mutation_oneMateOneSSCS = True - alt1ff = 0 - alt4ff = 0 - alt2ff = 0 - alt3ff = 0 + tier1ff = 0 + tier4ff = 0 + tier2ff = 0 + tier3ff = 0 trimmed = False contradictory = False @@ -866,6 +1015,7 @@ total1new = 0 alt1ff = 0 alt1f = 0 + tier1ff = 0 trimmed = True if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): @@ -873,6 +1023,7 @@ total4new = 0 alt4ff = 0 alt4f = 0 + tier4ff = 0 trimmed = True if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): @@ -880,6 +1031,7 @@ total2new = 0 alt2ff = 0 alt2f = 0 + tier2ff = 0 trimmed = True if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): @@ -887,117 +1039,145 @@ total3new = 0 alt3ff = 0 alt3f = 0 + tier3ff = 0 trimmed = True details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) # assign tiers if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): tier = "1.1" counter_tier11 += 1 - tier_dict[key1]["tier 1.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 1.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 1.1"] += 1 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): + all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): tier = "1.2" counter_tier12 += 1 - tier_dict[key1]["tier 1.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 1.2"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 1.2"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): tier = "2.1" counter_tier21 += 1 - tier_dict[key1]["tier 2.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 2.1"] += 1 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): + all(float(ij) >= 0.75 for ij in [tier1ff, tier2ff, tier3ff, tier4ff])): tier = "2.2" counter_tier22 += 1 - tier_dict[key1]["tier 2.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.2"] += 1 + elif variant_type == 
"ref": + tier_dict_ref[key1]["tier 2.2"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) & - any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]) & + any(float(ij) >= 0.75 for ij in [tier2ff, tier3ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) & - any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]) & + any(float(ij) >= 0.75 for ij in [tier1ff, tier4ff]))): tier = "2.3" counter_tier23 += 1 - tier_dict[key1]["tier 2.3"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.3"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 2.3"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & - all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.75 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & - all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.75 for ij in [tier2ff, tier3ff]))): tier = "2.4" counter_tier24 += 1 - tier_dict[key1]["tier 2.4"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 2.4"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 2.4"] += 1 elif ((len(pure_tags_dict_short[key1]) > 1) & - (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) | - all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): + (all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff]) | + all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))): tier = "3.1" counter_tier31 += 1 - tier_dict[key1]["tier 3.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 3.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 3.1"] += 1 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & - all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) | + all(float(ij) >= 0.5 for ij in [tier1ff, tier4ff])) | (all(int(ij) >= 1 for ij in [total2new, total3new]) & - all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): + all(float(ij) >= 0.5 for ij in [tier2ff, tier3ff]))): tier = "3.2" counter_tier32 += 1 - tier_dict[key1]["tier 3.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 3.2"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 3.2"] += 1 elif (trimmed): tier = "4" counter_tier4 += 1 - tier_dict[key1]["tier 4"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 4"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 4"] += 1 # assign tiers if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): trimmed_actual_high_tier = True elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])): + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])): trimmed_actual_high_tier = True elif 
((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): trimmed_actual_high_tier = True elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])): + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim])): trimmed_actual_high_tier = True elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]) & - any(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]) & + any(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim])) | (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]) & - any(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]) & + any(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim]))): trimmed_actual_high_tier = True elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & - all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | + all(float(ij) >= 0.75 for ij in [tier1ff_trim, tier4ff_trim])) | (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & - all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): + all(float(ij) >= 0.75 for ij in [tier2ff_trim, tier3ff_trim]))): trimmed_actual_high_tier = True else: trimmed_actual_high_tier = False @@ -1005,37 +1185,58 @@ elif softclipped_mutation_allMates: tier = "5.1" counter_tier51 += 1 - tier_dict[key1]["tier 5.1"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.1"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.1"] += 1 elif softclipped_mutation_oneOfTwoMates: tier = "5.2" counter_tier52 += 1 - tier_dict[key1]["tier 5.2"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.2"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.2"] += 1 elif softclipped_mutation_oneOfTwoSSCS: tier = "5.3" counter_tier53 += 1 - tier_dict[key1]["tier 5.3"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.3"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.3"] += 1 elif softclipped_mutation_oneMate: tier = "5.4" counter_tier54 += 1 - tier_dict[key1]["tier 5.4"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.4"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.4"] += 1 elif softclipped_mutation_oneMateOneSSCS: tier = "5.5" counter_tier55 += 1 - tier_dict[key1]["tier 5.5"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 5.5"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 5.5"] += 1 elif (contradictory): tier = "6" counter_tier6 += 1 - tier_dict[key1]["tier 6"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 6"] += 1 + elif variant_type == "ref": + 
tier_dict_ref[key1]["tier 6"] += 1 else: tier = "7" counter_tier7 += 1 - tier_dict[key1]["tier 7"] += 1 + if variant_type == "alt": + tier_dict[key1]["tier 7"] += 1 + elif variant_type == "ref": + tier_dict_ref[key1]["tier 7"] += 1 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) @@ -1114,13 +1315,12 @@ read_pos2 = read_len_median2 = None if (read_pos3 == -1): read_pos3 = read_len_median3 = None - line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) + line = (var_id, tier, variant_type, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) # ws1.write_row(row, 0, line) # csv_writer.writerow(line) - line2 = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) + line2 = ("", "", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) # ws1.write_row(row + 1, 0, line2) # csv_writer.writerow(line2) - # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), # {'type': 'formula', # 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), @@ -1155,12 +1355,19 @@ # write to file # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]]) + sum_highTiers_ref = sum([tier_dict_ref[key1][ij] for ij in list(sorted(tier_dict_ref[key1].keys()))[:6]]) correct_tier = False + correct_tier_ref = False if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] tier_dict[key1]["tier 4"] = 0 correct_tier = True + if tier_dict_ref[key1]["tier 4"] > 0 and sum_highTiers_ref > 0: + tier_dict_ref[key1]["tier 2.5"] = tier_dict_ref[key1]["tier 4"] + tier_dict_ref[key1]["tier 4"] = 0 + correct_tier_ref = True + for sample in change_tier_after_print: row_number = sample[0] line1 = sample[1] @@ -1168,44 +1375,47 @@ actual_high_tier = sample[3] current_tier = list(line1)[1] - if correct_tier and (current_tier == "4") and actual_high_tier: + if line1[2] == "alt" and correct_tier and (current_tier == "4") and actual_high_tier: + line1 = list(line1) + line1[1] = "2.5" + line1 = tuple(line1) + counter_tier25 += 1 + counter_tier4 -= 1 + if line1[2] == "ref" and correct_tier_ref and (current_tier == "4") and actual_high_tier: line1 = list(line1) line1[1] = "2.5" line1 = tuple(line1) counter_tier25 += 1 counter_tier4 -= 1 + ws1.write_row(row_number, 0, line1) csv_writer.writerow(line1) ws1.write_row(row_number + 1, 0, line2) csv_writer.writerow(line2) + if line1[2] == "alt": + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", 
$B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + elif line1[2] == "ref": + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), - 'format': format1, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), - 'format': format3, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=$B${}>="3"'.format(row_number + 1), - 'format': format2, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) # sheet 2 if chimera_correction: - header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', - 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', - 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', - 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') + header_line2 = 
('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', + 'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)', + 'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)', + 'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)', + 'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)' + ) else: - header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', - 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', - 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', - 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') - + header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', + 'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)', + 'tier 3.1 (alt)', 'tier 3.2 (alt)', 'tier 4 (alt)', 'tier 5.1 (alt)', 'tier 5.2 (alt)', 'tier 5.3 (alt)', 'tier 5.4 (alt)', 'tier 5.5 (alt)', 'tier 6 (alt)', 'tier 7 (alt)', + 'tier 1.1 (ref)', 'tier 1.2 (ref)', 'tier 2.1 (ref)', 'tier 2.2 (ref)', 'tier 2.3 (ref)', 'tier 2.4 (ref)', 'tier 2.5 (ref)', + 'tier 3.1 (ref)', 'tier 3.2 (ref)', 'tier 4 (ref)', 'tier 5.1 (ref)', 'tier 5.2 (ref)', 'tier 5.3 (ref)', 'tier 5.4 (ref)', 'tier 5.5 (ref)', 'tier 6 (ref)', 'tier 7 (ref)' + ) ws2.write_row(0, 0, header_line2) row = 0 @@ -1220,9 +1430,12 @@ alt_count = cvrg_dict[key1][1] cvrg = ref_count + alt_count + ref_tiers = tier_dict_ref[key1] + var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) lst = [var_id, cvrg] used_tiers = [] + used_tiers_ref = [t for k, t in sorted(ref_tiers.items())] cum_af = [] for key2, value2 in sorted(value1.items()): # calculate cummulative AF @@ -1233,24 +1446,25 @@ if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place continue lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) - lst.extend([(cvrg - sum(used_tiers[-10:])), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (cvrg - sum(used_tiers[-10:])))]) + lst.extend([(sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])), sum(used_tiers_ref[0:7]), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])))]) if chimera_correction: chimeras_all = chimera_dict[key1][1] new_alt = sum(used_tiers[0:7]) - chimeras_all fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:7]))) if fraction_chimeras is None: fraction_chimeras = 0. 
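# --- Editorial sketch, not part of the patch ------------------------------
# The changed lines around this point redefine the tier 1.1-2.5 coverage as
# the sum of ref- and alt-supporting counts in those tiers (instead of the
# old "cvrg - sum(used_tiers[-10:])") and apply the chimera correction to
# that sum. A minimal, self-contained restatement with illustrative names:
def tier_restricted_af(used_tiers, used_tiers_ref, chimeras_all=0):
    alt_high = sum(used_tiers[0:7])       # AC alt, tiers 1.1-2.5
    ref_high = sum(used_tiers_ref[0:7])   # AC ref, tiers 1.1-2.5
    cvrg_high = ref_high + alt_high       # cvrg (tiers 1.1-2.5)
    af_high = alt_high / cvrg_high if cvrg_high else None
    frac_chim = chimeras_all / alt_high if alt_high else 0.0
    new_cvrg = cvrg_high * (1.0 - frac_chim)   # chimera-corrected cvrg
    new_alt = alt_high - chimeras_all          # chimeras removed from AC alt
    af_corrected = new_alt / new_cvrg if new_cvrg else None
    return cvrg_high, ref_high, alt_high, af_high, new_cvrg, af_corrected
# ---------------------------------------------------------------------------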
- new_cvrg = (cvrg - sum(used_tiers[-10:])) * (1. - fraction_chimeras) + new_cvrg = (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])) * (1. - fraction_chimeras) lst.extend([new_cvrg, chimeras_all, safe_div(new_alt, new_cvrg)]) lst.extend([alt_count, safe_div(alt_count, cvrg)]) lst.extend(used_tiers) - lst.extend(cum_af) + lst.extend(used_tiers_ref) + # lst.extend(cum_af) lst = tuple(lst) ws2.write_row(row + 1, 0, lst) if chimera_correction: - ws2.conditional_format('M{}:N{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$M$1="tier 1.1"', 'format': format12, 'multi_range': 'M{}:N{} M1:N1'.format(row + 2, row + 2)}) - ws2.conditional_format('O{}:S{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$O$1="tier 2.1"', 'format': format32, 'multi_range': 'O{}:S{} O1:S1'.format(row + 2, row + 2)}) - ws2.conditional_format('T{}:AC{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$T$1="tier 3.1"', 'format': format22, 'multi_range': 'T{}:AC{} T1:AC1'.format(row + 2, row + 2)}) + ws2.conditional_format('N{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$N$1="tier 1.1 (alt)"', 'format': format12, 'multi_range': 'N{}:O{} N1:O1 AE{}:AF{} AE1:AF1'.format(row + 2, row + 2, row + 2, row + 2)}) + ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 2.1 (alt)"', 'format': format32, 'multi_range': 'P{}:T{} P1:T1 AG{}:AK{} AG1:AK1'.format(row + 2, row + 2, row + 2, row + 2)}) + ws2.conditional_format('U{}:AD{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$U$1="tier 3.1 (alt)"', 'format': format22, 'multi_range': 'U{}:AD{} U1:AD1 AL{}:AU{} AL1:AU1'.format(row + 2, row + 2, row + 2, row + 2)}) else: ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)}) ws2.conditional_format('L{}:P{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:P{} L1:P1'.format(row + 2, row + 2)}) diff -r 1797e461d674 -r fdfe9a919ff7 read2mut.xml --- a/read2mut.xml Mon Mar 29 09:22:57 2021 +0000 +++ b/read2mut.xml Fri Jul 22 09:19:44 2022 +0000 @@ -1,6 +1,6 @@ - - Looks for reads with mutation at known positions and calculates frequencies and stats. + + Looks for reads with a mutation at known positions and calculates frequencies and stats. va_macros.xml @@ -28,12 +28,12 @@ ]]> - + - - + + @@ -57,10 +57,10 @@ - - - - + + + + `_ or `LoFreq `_ variant caller. **Dataset 2:** BAM file of aligned raw reads. This file can be obtained by the tool `Map with BWA-MEM `_. **Dataset 3:** JSON file generated by the **DCS mutations to tags/reads** tool -containing dictonaries of the tags of reads containing mutations +containing dictionaries of the tags of reads containing mutations in the DCS. **Dataset 4:** JSON file generated by the **DCS mutations to SSCS stats** tool -stats of tags that carry a mutation in the SSCS at the same position a mutation +stats of tags that carry a mutation and the reference allele in the SSCS at the same position a mutation is called in the DCS. **Output** -The output are three XLSX files containing frequencies stats for DCS mutations based -on information from the raw reads and a CSV file containing the summary information without color-coding. In addition to that a tier based -classification is provided based on the amout of support for a true variant call. 
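Editorial note: the worksheet colour rules changed in the hunks above use xlsxwriter's conditional_format() with a 'formula' criterion, and its multi_range option extends one rule over non-contiguous cell ranges (here the alt tier columns, the ref tier columns, and the header row). A minimal standalone sketch follows; the file name, colour, and column layout are illustrative, not the tool's actual ones:

import xlsxwriter

wb = xlsxwriter.Workbook("demo.xlsx")                 # illustrative output file
ws = wb.add_worksheet()
tier1_fmt = wb.add_format({"bg_color": "#C6EFCE"})    # hypothetical tier-1 colour

ws.write_row(0, 0, ("variant ID", "tier"))
ws.write_row(1, 0, ("chr1-100-C-A", "1.1"))

# Colour cell B2 when the tier is 1.1 or 1.2; multi_range makes the same
# rule also apply to the header cell B1 without a second call.
ws.conditional_format("B2:B2", {
    "type": "formula",
    "criteria": '=OR($B$2="1.1", $B$2="1.2")',
    "format": tier1_fmt,
    "multi_range": "B2:B2 B1:B1",
})
wb.close()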
+The output is three XLSX files containing frequencies stats for DCS mutations based +on information from the raw reads and a CSV file containing the summary information without color-coding. In addition to that, a tier-based +classification is provided based on the amount of support for a true variant call. ]]> diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Interesting_Reads_test.trim.bai Binary file test-data/Interesting_Reads_test.trim.bai has changed diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Interesting_Reads_test.trim.bam Binary file test-data/Interesting_Reads_test.trim.bam has changed diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Variant_Analyzer_allele_frequencies_test.xlsx Binary file test-data/Variant_Analyzer_allele_frequencies_test.xlsx has changed diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Variant_Analyzer_summary_test.csv --- a/test-data/Variant_Analyzer_summary_test.csv Mon Mar 29 09:22:57 2021 +0000 +++ b/test-data/Variant_Analyzer_summary_test.csv Fri Jul 22 09:19:44 2022 +0000 @@ -1,7 +1,11 @@ -variant ID,tier,tag,mate,read pos.ab,read pos.ba,read median length.ab,read median length.ba,DCS median length,FS.ab,FS.ba,FSqc.ab,FSqc.ba,ref.ab,ref.ba,alt.ab,alt.ba,rel. ref.ab,rel. ref.ba,rel. alt.ab,rel. alt.ba,na.ab,na.ba,lowq.ab,lowq.ba,trim.ab,trim.ba,SSCS alt.ab,SSCS alt.ba,SSCS ref.ab,SSCS ref.ba,in phase,chimeric tag -ACH_TDII_5regions-505-C-A,2.1,GATAACCTTGCTTCGTGATTAATC,ab1.ba2,132.0,131.0,264.0,263.0,173.0,1,3,1,3,0,0,1,3,0,0,1.0,1.0,0,0,0,0,0,0,2,1,1,1,, -,,GATAACCTTGCTTCGTGATTAATC,ab2.ba1,,,,,173.0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,2,1,1,1,, -ACH_TDII_5regions-571-C-T,1.1,GATTGGATAACGTTGTGGCAATTG,ab1.ba2,129.0,218.0,195.0,284.0,143.0,4,5,4,5,0,0,4,5,0,0,1.0,1.0,0,0,0,0,0,0,1,1,2,1,, -,,GATTGGATAACGTTGTGGCAATTG,ab2.ba1,264.0,264.0,278.5,283.5,143.0,2,2,2,2,0,0,2,2,0,0,1.0,1.0,0,0,0,0,0,0,1,1,2,1,, -ACH_TDII_5regions-958-T-C,2.4,CCTCCCGGCAGTGCGAAAATGTCA,ab1.ba2,,,,,195.0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,1,1,1,0,, -,,CCTCCCGGCAGTGCGAAAATGTCA,ab2.ba1,97.0,91.0,283.0,277.0,195.0,1,1,1,1,0,0,1,1,0,0,1.0,1.0,0,0,0,0,0,0,1,1,1,0,, +variant ID,tier,allele,tag,mate,read pos.ab,read pos.ba,read median length.ab,read median length.ba,DCS median length,FS.ab,FS.ba,FSqc.ab,FSqc.ba,ref.ab,ref.ba,alt.ab,alt.ba,rel. ref.ab,rel. ref.ba,rel. alt.ab,rel. 
alt.ba,na.ab,na.ba,lowq.ab,lowq.ba,trim.ab,trim.ba,SSCS alt.ab,SSCS alt.ba,SSCS ref.ab,SSCS ref.ba,in phase,chimeric tag +ACH_TDII_5regions-505-C-A,2.1,alt,GATAACCTTGCTTCGTGATTAATC,ab1.ba2,132.0,131.0,264.0,263.0,173.0,1,3,1,3,0,0,1,3,0,0,1.0,1.0,0,0,0,0,0,0,2,1,1,1,, +,,,GATAACCTTGCTTCGTGATTAATC,ab2.ba1,,,,,173.0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,2,1,1,1,, +ACH_TDII_5regions-505-C-A,1.1,ref,GATTGGATAACGTTGTGGCAATTG,ab1.ba2,73.0,152.0,205.0,284.0,173.0,3,5,3,5,3,5,0,0,1.0,1.0,0,0,0,0,0,0,0,0,2,1,1,1,, +,,,GATTGGATAACGTTGTGGCAATTG,ab2.ba1,198.0,198.0,263.5,283.0,173.0,4,3,4,3,4,3,0,0,1.0,1.0,0,0,0,0,0,0,0,0,2,1,1,1,, +ACH_TDII_5regions-571-C-T,2.1,ref,GATAACCTTGCTTCGTGATTAATC,ab1.ba2,198.0,197.0,264.0,263.0,143.0,1,3,1,3,1,3,0,0,1.0,1.0,0,0,0,0,0,0,0,0,1,1,2,1,, +,,,GATAACCTTGCTTCGTGATTAATC,ab2.ba1,,,,,143.0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,1,1,2,1,, +ACH_TDII_5regions-571-C-T,1.1,alt,GATTGGATAACGTTGTGGCAATTG,ab1.ba2,129.0,218.0,195.0,284.0,143.0,4,5,4,5,0,0,4,5,0,0,1.0,1.0,0,0,0,0,0,0,1,1,2,1,, +,,,GATTGGATAACGTTGTGGCAATTG,ab2.ba1,264.0,264.0,278.5,283.5,143.0,2,2,2,2,0,0,2,2,0,0,1.0,1.0,0,0,0,0,0,0,1,1,2,1,, +ACH_TDII_5regions-958-T-C,2.4,alt,CCTCCCGGCAGTGCGAAAATGTCA,ab1.ba2,,,,,195.0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,1,1,1,0,, +,,,CCTCCCGGCAGTGCGAAAATGTCA,ab2.ba1,97.0,91.0,283.0,277.0,195.0,1,1,1,1,0,0,1,1,0,0,1.0,1.0,0,0,0,0,0,0,1,1,1,0,, diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Variant_Analyzer_summary_test.xlsx Binary file test-data/Variant_Analyzer_summary_test.xlsx has changed diff -r 1797e461d674 -r fdfe9a919ff7 test-data/Variant_Analyzer_tiers_test.xlsx Binary file test-data/Variant_Analyzer_tiers_test.xlsx has changed diff -r 1797e461d674 -r fdfe9a919ff7 test-data/tag_count_dict_test.json --- a/test-data/tag_count_dict_test.json Mon Mar 29 09:22:57 2021 +0000 +++ b/test-data/tag_count_dict_test.json Fri Jul 22 09:19:44 2022 +0000 @@ -1,1 +1,1 @@ -[{"GATAACCTTGCTTCGTGATTAATC": {"ACH_TDII_5regions#504#C#A": "A"}, "GATTGGATAACGTTGTGGCAATTG": {"ACH_TDII_5regions#570#C#T": "T"}, "CCTCCCGGCAGTGCGAAAATGTCA": {"ACH_TDII_5regions#957#T#C": "C"}}, {"ACH_TDII_5regions#957#T#C": [0, 1, 195.0], "ACH_TDII_5regions#504#C#A": [1, 1, 173.0], "ACH_TDII_5regions#570#C#T": [1, 1, 143.0]}] \ No newline at end of file +[{"GATAACCTTGCTTCGTGATTAATC": {"ACH_TDII_5regions#504#C#A": "A"}, "GATTGGATAACGTTGTGGCAATTG": {"ACH_TDII_5regions#570#C#T": "T"}, "CCTCCCGGCAGTGCGAAAATGTCA": {"ACH_TDII_5regions#957#T#C": "C"}}, {"ACH_TDII_5regions#957#T#C": [0, 1, 195.0], "ACH_TDII_5regions#504#C#A": [1, 1, 173.0], "ACH_TDII_5regions#570#C#T": [1, 1, 143.0]}, {"GATAACCTTGCTTCGTGATTAATC": {"ACH_TDII_5regions#570#C#T": "C"}, "GATTGGATAACGTTGTGGCAATTG": {"ACH_TDII_5regions#504#C#A": "C"}}] \ No newline at end of file
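Editorial note: the updated tag_count_dict_test.json above now serialises a list of three dictionaries rather than two; the added third entry maps read tags to the reference allele observed at each variant position. A minimal sketch of consuming that file, assuming the test-data file name and that the second dictionary's values are (ref count, alt count, median DCS length) as the test values suggest:

import json

with open("tag_count_dict_test.json") as f:
    tag_dict, cvrg_dict, tag_dict_ref = json.load(f)

# tag_dict:     tag -> {"chrom#pos#ref#alt": alt base}  (alt-supporting DCS)
# tag_dict_ref: tag -> {"chrom#pos#ref#alt": ref base}  (ref-supporting DCS)
for variant_key, (ref_count, alt_count, dcs_median) in cvrg_dict.items():
    print(variant_key, ref_count, alt_count, dcs_median)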