# HG changeset patch # User mheinzl # Date 1616512697 0 # Node ID 6ccff403db8a8cdd97016e73a91a0027ddec4709 # Parent 5023186c20618c3c76148e65d8b4176fba752e4a planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8 diff -r 5023186c2061 -r 6ccff403db8a mut2read.py --- a/mut2read.py Fri Mar 19 14:16:31 2021 +0000 +++ b/mut2read.py Tue Mar 23 15:18:17 2021 +0000 @@ -72,16 +72,12 @@ for variant in VCF(file1): chrom = variant.CHROM stop_pos = variant.start - #chrom_stop_pos = str(chrom) + "#" + str(stop_pos) ref = variant.REF if len(variant.ALT) == 0: continue else: alt = variant.ALT[0] - print(alt) chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt - - dcs_len = [] if len(ref) == len(alt): for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): @@ -152,4 +148,3 @@ if __name__ == '__main__': sys.exit(mut2read(sys.argv)) - diff -r 5023186c2061 -r 6ccff403db8a mut2read.xml --- a/mut2read.xml Fri Mar 19 14:16:31 2021 +0000 +++ b/mut2read.xml Tue Mar 23 15:18:17 2021 +0000 @@ -1,5 +1,5 @@ - + Extracts all tags that carry a mutation in the duplex consensus sequence (DCS) va_macros.xml diff -r 5023186c2061 -r 6ccff403db8a mut2sscs.py --- a/mut2sscs.py Fri Mar 19 14:16:31 2021 +0000 +++ b/mut2sscs.py Tue Mar 23 15:18:17 2021 +0000 @@ -66,16 +66,13 @@ for variant in VCF(file1): chrom = variant.CHROM stop_pos = variant.start - #chrom_stop_pos = str(chrom) + "#" + str(stop_pos) ref = variant.REF if len(variant.ALT) == 0: continue else: alt = variant.ALT[0] chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt - if len(ref) == len(alt): - for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000): if pileupcolumn.reference_pos == stop_pos: count_alt = 0 @@ -137,4 +134,3 @@ if __name__ == '__main__': sys.exit(mut2sscs(sys.argv)) - diff -r 5023186c2061 -r 6ccff403db8a mut2sscs.xml --- a/mut2sscs.xml Fri Mar 19 14:16:31 2021 +0000 +++ b/mut2sscs.xml Tue Mar 23 15:18:17 2021 +0000 @@ -1,5 +1,5 @@ - + Extracts all tags from the single stranded consensus sequence (SSCS) bam file that carry a mutation at the same position a mutation is called in the duplex consensus sequence (DCS) and calculates their frequencies va_macros.xml diff -r 5023186c2061 -r 6ccff403db8a read2mut.py --- a/read2mut.py Fri Mar 19 14:16:31 2021 +0000 +++ b/read2mut.py Tue Mar 23 15:18:17 2021 +0000 @@ -131,11 +131,8 @@ mut_array = [] for count, variant in enumerate(VCF(file1)): - #if count == 2000: - # break chrom = variant.CHROM stop_pos = variant.start - #chrom_stop_pos = str(chrom) + "#" + str(stop_pos) ref = variant.REF if len(variant.ALT) == 0: continue @@ -161,17 +158,14 @@ count_other = 0 count_lowq = 0 n = 0 - #print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), - # "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) for pileupread in pileupcolumn.pileups: n += 1 if not pileupread.is_del and not pileupread.is_refskip: tag = pileupread.alignment.query_name nuc = pileupread.alignment.query_sequence[pileupread.query_position] phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 - # if read is softclipped, store real position in reference - if "S" in pileupread.alignment.cigarstring: + if "S" in pileupread.alignment.cigarstring: # spftclipped at start if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) @@ -198,9 +192,9 @@ mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] real_start_end[chrom_stop_pos][tag] = [(start, end)] else: - mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) - reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) - mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) + mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) + reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) + mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) real_start_end[chrom_stop_pos][tag].append((start, end)) if nuc == alt: count_alt += 1 @@ -220,9 +214,6 @@ else: count_indel += 1 - #print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq)) - #else: - # print("indels are currently not evaluated") mut_array = np.array(mut_array) for read in bam.fetch(until_eof=True): if read.is_unmapped: @@ -242,10 +233,6 @@ # create pure_tags_dict pure_tags_dict = {} for key1, value1 in sorted(mut_dict.items()): - #if len(np.where(np.array(['#'.join(str(i) for i in z) - # for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0]) == 0: - # continue - i = np.where(np.array(['#'.join(str(i) for i in z) for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] ref = mut_array[i, 2] @@ -336,7 +323,7 @@ whole_array = pure_tags_dict_short[key1].keys() tier_dict[key1] = {} - values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0),("tier 2.5", 0), + values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 2.5", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0), ("tier 6", 0), ("tier 7", 0)] for k, v in values_tier_dict: @@ -619,41 +606,41 @@ softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads if any(ij is True for ij in softclipped_idx1): - softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] - softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1] - softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] - dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] - dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] - # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping - if any(ij is True for ij in softclipped_both_ends_idx1): - for nr, indx in enumerate(softclipped_both_ends_idx1): - if indx: - if dist_start_read1[nr] <= dist_end_read1[nr]: - dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number - else: - dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number - ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads - ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads + softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] + softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1] + softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] + dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] + dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] + # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping + if any(ij is True for ij in softclipped_both_ends_idx1): + for nr, indx in enumerate(softclipped_both_ends_idx1): + if indx: + if dist_start_read1[nr] <= dist_end_read1[nr]: + dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number + else: + dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number + ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads + ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads # mate 1 - SSCS ba softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads if any(ij is True for ij in softclipped_idx4): - softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] - softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4] - softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] - dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] - dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] - # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping - if any(ij is True for ij in softclipped_both_ends_idx4): - for nr, indx in enumerate(softclipped_both_ends_idx4): - if indx: - if dist_start_read4[nr] <= dist_end_read4[nr]: - dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number - else: - dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number - ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads - ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads + softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] + softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4] + softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] + dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] + dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] + # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping + if any(ij is True for ij in softclipped_both_ends_idx4): + for nr, indx in enumerate(softclipped_both_ends_idx4): + if indx: + if dist_start_read4[nr] <= dist_end_read4[nr]: + dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number + else: + dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number + ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads + ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads # mate 2 - SSCS ab softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] @@ -664,14 +651,14 @@ softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] - # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping + # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping if any(ij is True for ij in softclipped_both_ends_idx2): for nr, indx in enumerate(softclipped_both_ends_idx2): if indx: if dist_start_read2[nr] <= dist_end_read2[nr]: - dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number + dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number else: - dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number + dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads @@ -684,14 +671,14 @@ softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] - # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping + # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping if any(ij is True for ij in softclipped_both_ends_idx3): for nr, indx in enumerate(softclipped_both_ends_idx3): if indx: if dist_start_read3[nr] <= dist_end_read3[nr]: - dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh + dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh else: - dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number + dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads @@ -707,10 +694,10 @@ contradictory = True # softclipping tiers # information of both mates available --> all reads for both mates and SSCS are softclipped - elif (ratio1 & ratio4 & ratio2 & ratio3 & + elif (ratio1 & ratio4 & ratio2 & ratio3 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available - # if distance between softclipping and mutation is at start or end of the read smaller than threshold + all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = True softclipped_mutation_oneOfTwoMates = False softclipped_mutation_oneOfTwoSSCS = False @@ -726,13 +713,13 @@ # information of both mates available --> only one mate softclipped elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available - # if distance between softclipping and mutation is at start or end of the read smaller than threshold - min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red - min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue - max_end1 = max(max([ij[1] for ij in ref_positions1]), max([ij[1] for ij in ref_positions4])) # red - max_end2 = max(max([ij[1] for ij in ref_positions2]), max([ij[1] for ij in ref_positions3])) # blue - if (min_start1 > min_start2) or (max_end1 > max_end2): # if mate1 is red and mate2 is blue + all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + # if distance between softclipping and mutation is at start or end of the read smaller than threshold + min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red + min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue + max_end1 = max(max([ij[1] for ij in ref_positions1]), max([ij[1] for ij in ref_positions4])) # red + max_end2 = max(max([ij[1] for ij in ref_positions2]), max([ij[1] for ij in ref_positions3])) # blue + if (min_start1 > min_start2) or (max_end1 > max_end2): # if mate1 is red and mate2 is blue softclipped_mutation_oneOfTwoMates = False # blue mate at beginning softclipped if min_start1 > min_start2: @@ -742,19 +729,19 @@ read_len_median2 = read_len_median2 - n_spacer_barcode read_len_median3 = read_len_median3 - n_spacer_barcode # red mate at end softclipped - if max_end1 > max_end2: + if max_end1 > max_end2: n_spacer_barcode = max_end1 - max_end2 read_len_median1 = read_len_median1 - n_spacer_barcode read_len_median4 = read_len_median4 - n_spacer_barcode - elif (min_start1 < min_start2) or (max_end1 < max_end2): # if mate1 is blue and mate2 is red + elif (min_start1 < min_start2) or (max_end1 < max_end2): # if mate1 is blue and mate2 is red softclipped_mutation_oneOfTwoMates = False - if min_start1 < min_start2: + if min_start1 < min_start2: n_spacer_barcode = min_start2 - min_start1 read_pos1 = read_pos1 - n_spacer_barcode read_pos4 = read_pos4 - n_spacer_barcode read_len_median1 = read_len_median1 - n_spacer_barcode read_len_median4 = read_len_median4 - n_spacer_barcode - if max_end1 < max_end2: # if mate1 ends after mate 2 starts + if max_end1 < max_end2: # if mate1 ends after mate 2 starts n_spacer_barcode = max_end2 - max_end1 read_len_median2 = read_len_median2 - n_spacer_barcode read_len_median3 = read_len_median3 - n_spacer_barcode @@ -770,8 +757,8 @@ softclipped_mutation_oneOfTwoSSCS = False softclipped_mutation_oneMate = False softclipped_mutation_oneMateOneSSCS = False - - if softclipped_mutation_oneOfTwoMates is False: # check trimming tier + + if softclipped_mutation_oneOfTwoMates is False: # check trimming tier if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): beg1 = total1new total1new = 0 @@ -805,8 +792,8 @@ # information of both mates available --> only one mate softclipped elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & - all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available - # if distance between softclipping and mutation is at start or end of the read smaller than threshold + all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available + # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False softclipped_mutation_oneOfTwoSSCS = True @@ -823,9 +810,9 @@ # information of one mate available --> all reads of one mate are softclipped elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | - (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & - all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available - # if distance between softclipping and mutation is at start or end of the read smaller than threshold + (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & + all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available + # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False softclipped_mutation_oneOfTwoSSCS = False @@ -841,9 +828,9 @@ # information of one mate available --> only one SSCS is softclipped elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | - (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & - (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available - # if distance between softclipping and mutation is at start or end of the read smaller than threshold + (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & + (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available + # if distance between softclipping and mutation is at start or end of the read smaller than threshold softclipped_mutation_allMates = False softclipped_mutation_oneOfTwoMates = False softclipped_mutation_oneOfTwoSSCS = False @@ -961,7 +948,7 @@ counter_tier4 += 1 tier_dict[key1]["tier 4"] += 1 - # assign tiers + # assign tiers if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & @@ -1112,23 +1099,23 @@ if (read_pos3 == -1): read_pos3 = read_len_median3 = None line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) - #ws1.write_row(row, 0, line) - #csv_writer.writerow(line) + # ws1.write_row(row, 0, line) + # csv_writer.writerow(line) line2 = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) - #ws1.write_row(row + 1, 0, line2) - #csv_writer.writerow(line2) + # ws1.write_row(row + 1, 0, line2) + # csv_writer.writerow(line2) - #ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), + # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), # {'type': 'formula', # 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), # 'format': format1, # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) - #ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), + # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), # {'type': 'formula', # 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), # 'format': format3, # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) - #ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), + # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), # {'type': 'formula', # 'criteria': '=$B${}>="3"'.format(row + 1), # 'format': format2, @@ -1177,27 +1164,26 @@ csv_writer.writerow(line2) ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), - 'format': format1, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + {'type': 'formula', + 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), + 'format': format1, + 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), - 'format': format3, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + {'type': 'formula', + 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), + 'format': format3, + 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) ws1.conditional_format('L{}:M{}'.format(row_number + 1, row_number + 2), - {'type': 'formula', - 'criteria': '=$B${}>="3"'.format(row_number + 1), - 'format': format2, - 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - + {'type': 'formula', + 'criteria': '=$B${}>="3"'.format(row_number + 1), + 'format': format2, + 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) # sheet 2 if chimera_correction: header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', - 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', - 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', - 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') + 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', + 'tier 3.1', 'tier 3.2', 'tier 4', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'tier 7', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', + 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-2.5', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6', 'AF 1.1-7') else: header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)', 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 'tier 2.5', @@ -1228,8 +1214,8 @@ if len(used_tiers) > 1: cum = safe_div(sum(used_tiers), cvrg) cum_af.append(cum) - if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place - continue + if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place + continue lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) lst.extend([(cvrg - sum(used_tiers[-10:])), sum(used_tiers[0:7]), safe_div(sum(used_tiers[0:7]), (cvrg - sum(used_tiers[-10:])))]) if chimera_correction: @@ -1257,7 +1243,7 @@ # sheet 3 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21), - ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24), ("tier 2.5", counter_tier25), + ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24), ("tier 2.5", counter_tier25), ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4", counter_tier4), ("tier 5.1", counter_tier51), ("tier 5.2", counter_tier52), ("tier 5.3", counter_tier53), ("tier 5.4", counter_tier54), ("tier 5.5", counter_tier55), ("tier 6", counter_tier6), ("tier 7", counter_tier7)] @@ -1403,4 +1389,3 @@ if __name__ == '__main__': sys.exit(read2mut(sys.argv)) - diff -r 5023186c2061 -r 6ccff403db8a read2mut.xml --- a/read2mut.xml Fri Mar 19 14:16:31 2021 +0000 +++ b/read2mut.xml Tue Mar 23 15:18:17 2021 +0000 @@ -1,5 +1,5 @@ - + Looks for reads with mutation at known positions and calculates frequencies and stats. va_macros.xml