# HG changeset patch
# User mheinzl
# Date 1614699161 0
# Node ID d21960b45a6ba82a1e63a4abd5a5380e4b971ee0
# Parent da224c392a54109ebd6566b1ded49061abff733e
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
diff -r da224c392a54 -r d21960b45a6b mut2read.py
--- a/mut2read.py Wed Feb 24 14:20:17 2021 +0000
+++ b/mut2read.py Tue Mar 02 15:32:41 2021 +0000
@@ -11,7 +11,7 @@
======= ========== ================= ================================
Version Date Author Description
-2.0.0 2020-10-30 Gundula Povysil -
+0.2.1 2019-10-27 Gundula Povysil -
======= ========== ================= ================================
USAGE: python mut2read.py DCS_Mutations.tabular DCS.bam Aligned_Families.tabular Interesting_Reads.fastq tag_count_dict.json
@@ -62,6 +62,7 @@
sys.exit("Error: Could not find '{}'".format(file3))
# read dcs bam file
+# pysam.index(file2)
bam = pysam.AlignmentFile(file2, "rb")
# get tags
@@ -73,8 +74,14 @@
stop_pos = variant.start
#chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
ref = variant.REF
- alt = variant.ALT[0]
+ if len(variant.ALT) == 0:
+ continue
+ else:
+ alt = variant.ALT[0]
+ print(alt)
chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
+
+
dcs_len = []
if len(ref) == len(alt):
for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
@@ -145,3 +152,4 @@
if __name__ == '__main__':
sys.exit(mut2read(sys.argv))
+
diff -r da224c392a54 -r d21960b45a6b mut2read.xml
--- a/mut2read.xml Wed Feb 24 14:20:17 2021 +0000
+++ b/mut2read.xml Tue Mar 02 15:32:41 2021 +0000
@@ -4,7 +4,12 @@
va_macros.xml
-
+
+ python
+ matplotlib
+ pysam
+ cyvcf2
+
-
+
-
+
diff -r da224c392a54 -r d21960b45a6b mut2sscs.py
--- a/mut2sscs.py Wed Feb 24 14:20:17 2021 +0000
+++ b/mut2sscs.py Tue Mar 02 15:32:41 2021 +0000
@@ -11,7 +11,7 @@
======= ========== ================= ================================
Version Date Author Description
-2.0.0 2020-10-30 Gundula Povysil -
+0.2.1 2019-10-27 Gundula Povysil -
======= ========== ================= ================================
USAGE: python mut2sscs.py DCS_Mutations.tabular SSCS.bam SSCS_counts.json
@@ -25,6 +25,7 @@
import os
import sys
+import numpy as np
import pysam
from cyvcf2 import VCF
@@ -55,6 +56,7 @@
sys.exit("Error: Could not find '{}'".format(file2))
# read SSCS bam file
+# pysam.index(file2)
bam = pysam.AlignmentFile(file2, "rb")
# get tags
@@ -66,10 +68,14 @@
stop_pos = variant.start
#chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
ref = variant.REF
- alt = variant.ALT[0]
+ if len(variant.ALT) == 0:
+ continue
+ else:
+ alt = variant.ALT[0]
chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
if len(ref) == len(alt):
+
for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=1000000000):
if pileupcolumn.reference_pos == stop_pos:
count_alt = 0
@@ -131,3 +137,4 @@
if __name__ == '__main__':
sys.exit(mut2sscs(sys.argv))
+
diff -r da224c392a54 -r d21960b45a6b mut2sscs.xml
--- a/mut2sscs.xml Wed Feb 24 14:20:17 2021 +0000
+++ b/mut2sscs.xml Tue Mar 02 15:32:41 2021 +0000
@@ -4,7 +4,12 @@
va_macros.xml
-
+
+ python
+ matplotlib
+ pysam
+ cyvcf2
+
1:
+ # for k1 in keys:
+ # whole_array.append(k1)
+ # else:
+ # whole_array.append(keys[0])
# output summary with threshold
workbook = xlsxwriter.Workbook(outfile)
@@ -268,7 +286,7 @@
'SSCS alt.ab', 'SSCS alt.ba', 'SSCS ref.ab', 'SSCS ref.ba',
'in phase', 'chimeric tag')
ws1.write_row(0, 0, header_line)
- csv_writer.writerow(header_line)
+
counter_tier11 = 0
counter_tier12 = 0
counter_tier21 = 0
@@ -277,15 +295,24 @@
counter_tier24 = 0
counter_tier31 = 0
counter_tier32 = 0
- counter_tier41 = 0
- counter_tier42 = 0
- counter_tier5 = 0
+ counter_tier33 = 0
+ counter_tier4 = 0
+ # if chimera_correction:
+ # counter_tier43 = 0
+ counter_tier51 = 0
+ counter_tier52 = 0
+ counter_tier53 = 0
+ counter_tier54 = 0
+ counter_tier55 = 0
counter_tier6 = 0
+ counter_tier7 = 0
+
row = 1
tier_dict = {}
chimera_dict = {}
for key1, value1 in sorted(mut_dict.items()):
counts_mut = 0
+ chimeric_tag_list = []
chimeric_tag = {}
if key1 in pure_tags_dict_short.keys():
i = np.where(np.array(['#'.join(str(i) for i in z)
@@ -293,11 +320,12 @@
ref = mut_array[i, 2]
alt = mut_array[i, 3]
dcs_median = cvrg_dict[key1][2]
- whole_array = list(pure_tags_dict_short[key1].keys())
+ whole_array = pure_tags_dict_short[key1].keys()
tier_dict[key1] = {}
- values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0),
- ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0), ("tier 6", 0)]
+ values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0),
+ ("tier 3.2", 0), ("tier 3.3", 0), ("tier 4", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0),
+ ("tier 6", 0), ("tier 7", 0)]
for k, v in values_tier_dict:
tier_dict[key1][k] = v
@@ -326,11 +354,15 @@
total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values())
if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
na1 = mut_dict[key1][key2[:-5] + '.ab.1']['na']
+ # na1f = na1/total1
else:
+ # na1 = na1f = 0
na1 = 0
if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
lowq1 = mut_dict[key1][key2[:-5] + '.ab.1']['lowQ']
+ # lowq1f = lowq1 / total1
else:
+ # lowq1 = lowq1f = 0
lowq1 = 0
if ref in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
ref1 = mut_dict[key1][key2[:-5] + '.ab.1'][ref]
@@ -365,11 +397,15 @@
total2 = sum(mut_dict[key1][key2[:-5] + '.ab.2'].values())
if 'na' in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
na2 = mut_dict[key1][key2[:-5] + '.ab.2']['na']
+ # na2f = na2 / total2
else:
+ # na2 = na2f = 0
na2 = 0
if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
lowq2 = mut_dict[key1][key2[:-5] + '.ab.2']['lowQ']
+ # lowq2f = lowq2 / total2
else:
+ # lowq2 = lowq2f = 0
lowq2 = 0
if ref in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
ref2 = mut_dict[key1][key2[:-5] + '.ab.2'][ref]
@@ -404,11 +440,15 @@
total3 = sum(mut_dict[key1][key2[:-5] + '.ba.1'].values())
if 'na' in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
na3 = mut_dict[key1][key2[:-5] + '.ba.1']['na']
+ # na3f = na3 / total3
else:
+ # na3 = na3f = 0
na3 = 0
if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
lowq3 = mut_dict[key1][key2[:-5] + '.ba.1']['lowQ']
+ # lowq3f = lowq3 / total3
else:
+ # lowq3 = lowq3f = 0
lowq3 = 0
if ref in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
ref3 = mut_dict[key1][key2[:-5] + '.ba.1'][ref]
@@ -439,11 +479,15 @@
total4 = sum(mut_dict[key1][key2[:-5] + '.ba.2'].values())
if 'na' in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
na4 = mut_dict[key1][key2[:-5] + '.ba.2']['na']
+ # na4f = na4 / total4
else:
+ # na4 = na4f = 0
na4 = 0
if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
lowq4 = mut_dict[key1][key2[:-5] + '.ba.2']['lowQ']
+ # lowq4f = lowq4 / total4
else:
+ # lowq4 = lowq4f = 0
lowq4 = 0
if ref in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
ref4 = mut_dict[key1][key2[:-5] + '.ba.2'][ref]
@@ -472,19 +516,38 @@
read_pos1 = read_pos2 = read_pos3 = read_pos4 = -1
read_len_median1 = read_len_median2 = read_len_median3 = read_len_median4 = 0
-
+ cigars_dcs1 = cigars_dcs2 = cigars_dcs3 = cigars_dcs4 = []
+ pos_read1 = pos_read2 = pos_read3 = pos_read4 = []
+ end_read1 = end_read2 = end_read3 = end_read4 = []
if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys():
- read_pos1 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.1'])
- read_len_median1 = np.median(reads_dict[key1][key2[:-5] + '.ab.1'])
+ read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1']))
+ read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1']))
+ cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1']
+ #print(mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'])
+ pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1']
+ #print(cigars_dcs1)
+ end_read1 = reads_dict[key1][key2[:-5] + '.ab.1']
if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys():
- read_pos2 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.2'])
- read_len_median2 = np.median(reads_dict[key1][key2[:-5] + '.ab.2'])
+ read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2']))
+ read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2']))
+ cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2']
+ pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2']
+ end_read2 = reads_dict[key1][key2[:-5] + '.ab.2']
if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys():
- read_pos3 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.1'])
- read_len_median3 = np.median(reads_dict[key1][key2[:-5] + '.ba.1'])
+ read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1']))
+ read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1']))
+ cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1']
+ pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1']
+ end_read3 = reads_dict[key1][key2[:-5] + '.ba.1']
if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys():
- read_pos4 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.2'])
- read_len_median4 = np.median(reads_dict[key1][key2[:-5] + '.ba.2'])
+ read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2']))
+ read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2']))
+ #print(mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'])
+ cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2']
+
+ pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2']
+ #print(cigars_dcs4)
+ end_read4 = reads_dict[key1][key2[:-5] + '.ba.2']
used_keys.append(key2[:-5])
counts_mut += 1
@@ -515,145 +578,370 @@
details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
- trimmed_five = False
- trimmed_three = False
+ trimmed = False
contradictory = False
+ softclipped_mutation_allMates = False
+ softclipped_mutation_oneOfTwoMates = False
+ softclipped_mutation_oneOfTwoSSCS = False
+ softclipped_mutation_oneMate = False
+ softclipped_mutation_oneMateOneSSCS = False
+ print()
+ print(key1, cigars_dcs1, cigars_dcs4, cigars_dcs2, cigars_dcs3)
+ dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = []
+ dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = []
+ ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False
+ ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False
- if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
+ # mate 1 - SSCS ab
+ softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1]
+ ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads
+
+ if any(ij is True for ij in softclipped_idx1):
+ softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1]
+ softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1]
+ softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1]
+ dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)]
+ dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)]
+
+ # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+ if any(ij is True for ij in softclipped_both_ends_idx1):
+ print(softclipped_both_ends_idx1)
+ for nr, indx in enumerate(softclipped_both_ends_idx1):
+ if indx:
+ if dist_start_read1[nr] <= dist_end_read1[nr]:
+ dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number
+ else:
+ dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number
+ ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads
+ ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads
+ print(key1, "mate1 ab", dist_start_read1, dist_end_read1, cigars_dcs1, ratio1, ratio_dist_start1, ratio_dist_end1)
+
+ # mate 1 - SSCS ba
+ softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4]
+ ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads
+ if any(ij is True for ij in softclipped_idx4):
+ softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4]
+ softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4]
+ softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4]
+ dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)]
+ dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)]
+
+ # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+ if any(ij is True for ij in softclipped_both_ends_idx4):
+ print(softclipped_both_ends_idx4)
+ for nr, indx in enumerate(softclipped_both_ends_idx4):
+ if indx:
+ if dist_start_read4[nr] <= dist_end_read4[nr]:
+ dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number
+ else:
+ dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number
+ ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads
+ ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads
+ print(key1, "mate1 ba", dist_start_read4, dist_end_read4,cigars_dcs4, ratio4, ratio_dist_start4, ratio_dist_end4)
+
+ # mate 2 - SSCS ab
+ softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2]
+ #print(sum(softclipped_idx2))
+ ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads
+ if any(ij is True for ij in softclipped_idx2):
+ softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2]
+ softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2]
+ softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2]
+ dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)]
+ dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)]
+
+ # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+ if any(ij is True for ij in softclipped_both_ends_idx2):
+ print(softclipped_both_ends_idx2)
+ for nr, indx in enumerate(softclipped_both_ends_idx2):
+ if indx:
+ if dist_start_read2[nr] <= dist_end_read2[nr]:
+ dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number
+ else:
+ dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number
+ ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads
+ #print(ratio_dist_end2)
+ #print([True if x <= thr else False for x in ratio_dist_end2])
+ ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads
+ print(key1, "mate2 ab", dist_start_read2, dist_end_read2,cigars_dcs2, ratio2, ratio_dist_start2, ratio_dist_end2)
+
+ # mate 2 - SSCS ba
+ softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3]
+ ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads
+ if any(ij is True for ij in softclipped_idx3):
+ softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3]
+ softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3]
+ softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3]
+ dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)]
+ dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)]
+
+ # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
+ if any(ij is True for ij in softclipped_both_ends_idx3):
+ print(softclipped_both_ends_idx3)
+ for nr, indx in enumerate(softclipped_both_ends_idx3):
+ if indx:
+ if dist_start_read3[nr] <= dist_end_read3[nr]:
+ dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh
+ else:
+ dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number
+ #print([True if x <= thr else False for x in dist_start_read3])
+ ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads
+ ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads
+ print(key1, "mate2 ba", dist_start_read3, dist_end_read3,cigars_dcs3, ratio3, ratio_dist_start3, ratio_dist_end3)
+
+ if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant
+ all(float(ij) == 0. for ij in [alt2ff, alt3ff])) |
+ (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) &
+ all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
alt1ff = 0
alt4ff = 0
alt2ff = 0
alt3ff = 0
- trimmed_five = False
- trimmed_three = False
+ trimmed = False
contradictory = True
+ # softclipping tiers
+ # information of both mates available --> all reads for both mates and SSCS are softclipped
+ elif (ratio1 & ratio4 & ratio2 & ratio3 &
+ (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
+ all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
+ # if distance between softclipping and mutation is at start or end of the read smaller than threshold
+ softclipped_mutation_allMates = True
+ softclipped_mutation_oneOfTwoMates = False
+ softclipped_mutation_oneOfTwoSSCS = False
+ softclipped_mutation_oneMate = False
+ softclipped_mutation_oneMateOneSSCS = False
+ alt1ff = 0
+ alt4ff = 0
+ alt2ff = 0
+ alt3ff = 0
+ trimmed = False
+ contradictory = False
+ print(key1, "softclipped_mutation_allMates", softclipped_mutation_allMates)
+ # information of both mates available --> only one mate softclipped
+ elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) |
+ (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) &
+ all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
+ # if distance between softclipping and mutation is at start or end of the read smaller than threshold
+ softclipped_mutation_allMates = False
+ softclipped_mutation_oneOfTwoMates = True
+ softclipped_mutation_oneOfTwoSSCS = False
+ softclipped_mutation_oneMate = False
+ softclipped_mutation_oneMateOneSSCS = False
+ alt1ff = 0
+ alt4ff = 0
+ alt2ff = 0
+ alt3ff = 0
+ trimmed = False
+ contradictory = False
+ print(key1, "softclipped_mutation_oneOfTwoMates", softclipped_mutation_oneOfTwoMates)
+ # information of both mates available --> only one mate softclipped
+ elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
+ ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
+ all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
+ # if distance between softclipping and mutation is at start or end of the read smaller than threshold
+ softclipped_mutation_allMates = False
+ softclipped_mutation_oneOfTwoMates = False
+ softclipped_mutation_oneOfTwoSSCS = True
+ softclipped_mutation_oneMate = False
+ softclipped_mutation_oneMateOneSSCS = False
+ alt1ff = 0
+ alt4ff = 0
+ alt2ff = 0
+ alt3ff = 0
+ trimmed = False
+ contradictory = False
+ print(key1, "softclipped_mutation_oneOfTwoSSCS", softclipped_mutation_oneOfTwoSSCS, [alt1ff, alt2ff, alt3ff, alt4ff])
+ # information of one mate available --> all reads of one mate are softclipped
+ elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) &
+ all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) |
+ (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
+ all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available
+ # if distance between softclipping and mutation is at start or end of the read smaller than threshold
+ #if ((((len(dist_start_read1) > 0 | len(dist_end_read1) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1))) &
+ # ((len(dist_start_read4) > 0 | len(dist_end_read4) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4)))) |
+ # (((len(dist_start_read2) > 0 | len(dist_end_read2) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2))) &
+ # ((len(dist_start_read3) > 0 | len(dist_end_read3) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3))))):
+ softclipped_mutation_allMates = False
+ softclipped_mutation_oneOfTwoMates = False
+ softclipped_mutation_oneOfTwoSSCS = False
+ softclipped_mutation_oneMate = True
+ softclipped_mutation_oneMateOneSSCS = False
+ alt1ff = 0
+ alt4ff = 0
+ alt2ff = 0
+ alt3ff = 0
+ trimmed = False
+ contradictory = False
+ print(key1, "softclipped_mutation_oneMate", softclipped_mutation_oneMate)
+ # information of one mate available --> only one SSCS is softclipped
+ elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
+ (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) |
+ (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
+ (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available
+ # if distance between softclipping and mutation is at start or end of the read smaller than threshold
+ #if ((all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1)) |
+ # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4))) |
+ # (all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2)) |
+ # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3)))):
+ softclipped_mutation_allMates = False
+ softclipped_mutation_oneOfTwoMates = False
+ softclipped_mutation_oneOfTwoSSCS = False
+ softclipped_mutation_oneMate = False
+ softclipped_mutation_oneMateOneSSCS = True
+ alt1ff = 0
+ alt4ff = 0
+ alt2ff = 0
+ alt3ff = 0
+ trimmed = False
+ contradictory = False
+ print(key1, "softclipped_mutation_oneMateOneSSCS", softclipped_mutation_oneMateOneSSCS)
+
else:
- if ((read_pos1 >= 0) and (read_pos1 <= trim5)):
+ if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
beg1 = total1new
total1new = 0
alt1ff = 0
alt1f = 0
- trimmed_five = True
+ trimmed = True
- if ((read_pos1 >= 0) and (abs(read_len_median1 - read_pos1) <= trim3)):
- beg1 = total1new
- total1new = 0
- alt1ff = 0
- alt1f = 0
- trimmed_three = True
-
- if ((read_pos4 >= 0) and (read_pos4 <= trim5)):
+ if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))):
beg4 = total4new
total4new = 0
alt4ff = 0
alt4f = 0
- trimmed_five = True
+ trimmed = True
- if ((read_pos4 >= 0) and (abs(read_len_median4 - read_pos4) <= trim3)):
- beg4 = total4new
- total4new = 0
- alt4ff = 0
- alt4f = 0
- trimmed_three = True
-
- if ((read_pos2 >= 0) and (read_pos2 <= trim5)):
+ if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))):
beg2 = total2new
total2new = 0
alt2ff = 0
alt2f = 0
- trimmed_five = True
-
- if ((read_pos2 >= 0) and (abs(read_len_median2 - read_pos2) <= trim3)):
- beg2 = total2new
- total2new = 0
- alt2ff = 0
- alt2f = 0
- trimmed_three = True
+ trimmed = True
- if ((read_pos3 >= 0) and (read_pos3 <= trim5)):
- beg3 = total3new
- total3new = 0
- alt3ff = 0
- alt3f = 0
- trimmed_five = True
-
- if ((read_pos3 >= 0) and (abs(read_len_median3 - read_pos3) <= trim3)):
+ if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))):
beg3 = total3new
total3new = 0
alt3ff = 0
alt3f = 0
- trimmed_three = True
-
+ trimmed = True
details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
# assign tiers
- if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | (all(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+ if ((all(int(ij) >= 3 for ij in [total1new, total4new]) &
+ all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+ (all(int(ij) >= 3 for ij in [total2new, total3new]) &
+ all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
tier = "1.1"
counter_tier11 += 1
tier_dict[key1]["tier 1.1"] += 1
- elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new])
- & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
+ elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) &
+ any(int(ij) >= 3 for ij in [total1new, total4new]) &
+ any(int(ij) >= 3 for ij in [total2new, total3new]) &
+ all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
tier = "1.2"
counter_tier12 += 1
tier_dict[key1]["tier 1.2"] += 1
- elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))
- | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+ elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
+ any(int(ij) >= 3 for ij in [total1new, total4new]) &
+ all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+ (all(int(ij) >= 1 for ij in [total2new, total3new]) &
+ any(int(ij) >= 3 for ij in [total2new, total3new]) &
+ all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
tier = "2.1"
counter_tier21 += 1
tier_dict[key1]["tier 2.1"] += 1
- elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
+ elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) &
+ all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
tier = "2.2"
counter_tier22 += 1
tier_dict[key1]["tier 2.2"] += 1
- elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) & any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))
- | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) & any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))):
+ elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
+ any(int(ij) >= 3 for ij in [total2new, total3new]) &
+ all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) &
+ any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) |
+ (all(int(ij) >= 1 for ij in [total2new, total3new]) &
+ any(int(ij) >= 3 for ij in [total1new, total4new]) &
+ all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) &
+ any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))):
tier = "2.3"
counter_tier23 += 1
tier_dict[key1]["tier 2.3"] += 1
- elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))
- | (all(int(ij) >= 1 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
+ elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
+ all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
+ (all(int(ij) >= 1 for ij in [total2new, total3new]) &
+ all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
tier = "2.4"
counter_tier24 += 1
tier_dict[key1]["tier 2.4"] += 1
- elif ((len(pure_tags_dict_short[key1]) > 1) & (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) | all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
+ elif ((len(pure_tags_dict_short[key1]) > 1) &
+ (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) |
+ all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
tier = "3.1"
counter_tier31 += 1
tier_dict[key1]["tier 3.1"] += 1
- elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]))
- | (all(int(ij) >= 1 for ij in [total2new, total3new]) & all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
+ elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) &
+ all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) |
+ (all(int(ij) >= 1 for ij in [total2new, total3new]) &
+ all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
tier = "3.2"
counter_tier32 += 1
tier_dict[key1]["tier 3.2"] += 1
- elif trimmed_five:
- tier = "4.1"
- counter_tier41 += 1
- tier_dict[key1]["tier 4.1"] += 1
+ elif (trimmed) and (len(pure_tags_dict_short[key1]) > 1):
+ tier = "3.3"
+ counter_tier33 += 1
+ tier_dict[key1]["tier 3.3"] += 1
+
+ elif (trimmed):
+ tier = "4"
+ counter_tier4 += 1
+ tier_dict[key1]["tier 4"] += 1
+
+ elif softclipped_mutation_allMates:
+ tier = "5.1"
+ counter_tier51 += 1
+ tier_dict[key1]["tier 5.1"] += 1
- elif trimmed_three:
- tier = "4.2"
- counter_tier42 += 1
- tier_dict[key1]["tier 4.2"] += 1
+ elif softclipped_mutation_oneOfTwoMates:
+ tier = "5.2"
+ counter_tier52 += 1
+ tier_dict[key1]["tier 5.2"] += 1
+
+ elif softclipped_mutation_oneOfTwoSSCS:
+ tier = "5.3"
+ counter_tier53 += 1
+ tier_dict[key1]["tier 5.3"] += 1
- elif contradictory:
- tier = "5"
- counter_tier5 += 1
- tier_dict[key1]["tier 5"] += 1
- else:
+ elif softclipped_mutation_oneMate:
+ tier = "5.4"
+ counter_tier54 += 1
+ tier_dict[key1]["tier 5.4"] += 1
+
+ elif softclipped_mutation_oneMateOneSSCS:
+ tier = "5.5"
+ counter_tier55 += 1
+ tier_dict[key1]["tier 5.5"] += 1
+
+ elif (contradictory):
tier = "6"
counter_tier6 += 1
tier_dict[key1]["tier 6"] += 1
+ else:
+ tier = "7"
+ counter_tier7 += 1
+ tier_dict[key1]["tier 7"] += 1
+
chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
- var_id = '-'.join([chrom, str(int(pos) + 1), ref, alt])
+ var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
sample_tag = key2[:-5]
array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time
# exclude identical tag from array2, to prevent comparison to itself
@@ -682,14 +970,14 @@
half1_mate2 = array2_half2
half2_mate2 = array2_half
# calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
- dist = np.array([sum(map(operator.ne, half1_mate1, c)) for c in half1_mate2])
+ dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2])
min_index = np.where(dist == dist.min()) # get index of min HD
# get all "b's" of the tag or all "a's" of the tag with minimum HD
min_tag_half2 = half2_mate2[min_index]
min_tag_array2 = array2[min_index] # get whole tag with min HD
min_value = dist.min()
# calculate HD of "b" to all "b's" or "a" to all "a's"
- dist_second_half = np.array([sum(map(operator.ne, half2_mate1, e))
+ dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
for e in min_tag_half2])
dist2 = dist_second_half.max()
@@ -700,7 +988,14 @@
if min_value == 0 or dist2 == 0:
min_tags_list_zeros.append(tag)
chimera_tags.append(max_tag)
+ # chimeric = True
+ # else:
+ # chimeric = False
+ # if mate_b is False:
+ # text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric)
+ # else:
+ # text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric)
i += 1
chimera_tags = [x for x in chimera_tags if x != []]
chimera_tags_new = []
@@ -733,28 +1028,26 @@
read_pos3 = read_len_median3 = None
line = (var_id, tier, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera)
ws1.write_row(row, 0, line)
- csv_writer.writerow(line)
line = ("", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera)
ws1.write_row(row + 1, 0, line)
- csv_writer.writerow(line)
ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
{'type': 'formula',
'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1),
'format': format1,
- 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)})
+ 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
{'type': 'formula',
'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(row + 1, row + 1, row + 1, row + 1),
'format': format3,
- 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)})
+ 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
{'type': 'formula',
'criteria': '=$B${}>="3"'.format(row + 1),
'format': format2,
- 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)})
+ 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+
row += 3
-
if chimera_correction:
chimeric_dcs_high_tiers = 0
chimeric_dcs = 0
@@ -767,23 +1060,19 @@
else:
chimeric_dcs_high_tiers += high_tiers
chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers)
- #csv_data.close()
-
# sheet 2
if chimera_correction:
header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'chimeras in AC alt (all tiers)', 'chimera-corrected cvrg', 'chimera-corrected AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'chimeras in AC alt (tiers 1.1-2.4)', 'chimera-corrected cvrg (tiers 1.1-2.4)', 'chimera-corrected AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
- 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
- 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
- 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5', 'AF 1.1-6')
+ 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
+ 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
+ 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6')
else:
header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
- 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
- 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5', 'AF 1.1-6')
+ 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
+ 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6')
ws2.write_row(0, 0, header_line2)
- #ws2.conditional_format('J1', {'type': 'formula', 'criteria': 'containing', 'value': 'tier 1.1', 'format': format1, 'multi_range': 'J1:K1'})
-
row = 0
for key1, value1 in sorted(tier_dict.items()):
@@ -797,7 +1086,7 @@
alt_count = cvrg_dict[key1][1]
cvrg = ref_count + alt_count
- var_id = '-'.join([chrom, str(int(pos) + 1), ref, alt])
+ var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
lst = [var_id, cvrg]
used_tiers = []
cum_af = []
@@ -807,6 +1096,8 @@
if len(used_tiers) > 1:
cum = safe_div(sum(used_tiers), cvrg)
cum_af.append(cum)
+ if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place
+ continue
lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)])
if chimera_correction:
chimeras_all = chimera_dict[key1][0]
@@ -816,14 +1107,14 @@
fraction_chimeras = 0.
new_cvrg = cvrg * (1. - fraction_chimeras)
lst.extend([chimeras_all, new_cvrg, safe_div(new_alt, new_cvrg)])
- lst.extend([(cvrg - sum(used_tiers[-6:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-6:])))])
+ lst.extend([(cvrg - sum(used_tiers[-11:])), sum(used_tiers[0:6]), safe_div(sum(used_tiers[0:6]), (cvrg - sum(used_tiers[-11:])))])
if chimera_correction:
chimeras_all = chimera_dict[key1][1]
new_alt = sum(used_tiers[0:6]) - chimeras_all
fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:6])))
if fraction_chimeras is None:
fraction_chimeras = 0.
- new_cvrg = (cvrg - sum(used_tiers[-6:])) * (1. - fraction_chimeras)
+ new_cvrg = (cvrg - sum(used_tiers[-11:])) * (1. - fraction_chimeras)
lst.extend([chimeras_all, new_cvrg, safe_div(new_alt, new_cvrg)])
lst.extend([alt_count, safe_div(alt_count, cvrg)])
lst.extend(used_tiers)
@@ -833,18 +1124,19 @@
if chimera_correction:
ws2.conditional_format('P{}:Q{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 1.1"', 'format': format12, 'multi_range': 'P{}:Q{} P1:Q1'.format(row + 2, row + 2)})
ws2.conditional_format('R{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$R$1="tier 2.1"', 'format': format32, 'multi_range': 'R{}:U{} R1:U1'.format(row + 2, row + 2)})
- ws2.conditional_format('V{}:AA{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format22, 'multi_range': 'V{}:AA{} V1:AA1'.format(row + 2, row + 2)})
+ ws2.conditional_format('V{}:AF{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format22, 'multi_range': 'V{}:AF{} V1:AF1'.format(row + 2, row + 2)})
else:
ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)})
- ws2.conditional_format('P{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format22, 'multi_range': 'P{}:U{} P1:U1'.format(row + 2, row + 2)})
+ ws2.conditional_format('P{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format22, 'multi_range': 'P{}:Z{} P1:Z1'.format(row + 2, row + 2)})
row += 1
# sheet 3
sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24),
- ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41),
- ("tier 4.2", counter_tier42), ("tier 5", counter_tier5), ("tier 6", counter_tier6)]
+ ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 3.3", counter_tier33), ("tier 4", counter_tier),
+ ("tier 5.1", counter_tier51), ("tier 5.2", counter_tier52),
+ ("tier 5.3", counter_tier53), ("tier 5.4", counter_tier54), ("tier 5.5", counter_tier55), ("tier 6", counter_tier6), ("tier 7", counter_tier7)]
header = ("tier", "count")
ws3.write_row(0, 0, header)
@@ -854,15 +1146,15 @@
ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2),
{'type': 'formula',
'criteria': '=OR($A${}="tier 1.1", $A${}="tier 1.2")'.format(i + 2, i + 2),
- 'format': format13})
+ 'format': format1})
ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2),
{'type': 'formula',
'criteria': '=OR($A${}="tier 2.1", $A${}="tier 2.2", $A${}="tier 2.3", $A${}="tier 2.4")'.format(i + 2, i + 2, i + 2, i + 2),
- 'format': format33})
+ 'format': format3})
ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2),
{'type': 'formula',
'criteria': '=$A${}>="3"'.format(i + 2),
- 'format': format23})
+ 'format': format2})
description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""),
("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"),
@@ -872,88 +1164,87 @@
("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"),
("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"),
("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"),
- ("Tier 4.1", "variants at the beginning of the reads"),
- ("Tier 4.2", "variants at the end of the reads"),
- ("Tier 5", "mates with contradictory information"),
+ ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"),
+ ("Tier 5.1", "variant is close to softclipping in both mates"),
+ ("Tier 5.2", "variant is close to softclipping in one of the mates"),
+ ("Tier 5.3", "variant is close to softclipping in one of the SSCS of both mates"),
+ ("Tier 5.4", "variant is close to softclipping in one mate (no information of second mate"),
+ ("Tier 5.5", "variant is close to softclipping in one of the SSCS (no information of the second mate"),
("Tier 6", "remaining variants")]
- examples_tiers = [[("chr5-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
+ examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
"3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
"4081", "4098", "5", "10", "", ""),
("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None,
"289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None,
"0", "0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", "")],
- [("chr5-11068-C-G", "1.1", "AAAAATGCGTAGAAATATGC", "ab1.ba2", "254", "228", "287", "288", "289",
+ [("Chr5:5-20000-11068-C-G", "1.1", "AAAAATGCGTAGAAATATGC", "ab1.ba2", "254", "228", "287", "288", "289",
"33", "43", "33", "43", "0", "0", "33", "43", "0", "0", "1", "1", "0", "0", "0", "0", "0",
"0", "4081", "4098", "5", "10", "", ""),
("", "", "AAAAATGCGTAGAAATATGC", "ab2.ba1", "268", "268", "270", "288", "289",
"11", "34", "10", "27", "0", "0", "10", "27", "0", "0", "1", "1", "0", "0", "1",
"7", "0", "0", "4081", "4098", "5", "10", "", "")],
- [("chr5-10776-G-T", "1.2", "CTATGACCCGTGAGCCCATG", "ab1.ba2", "132", "132", "287", "288", "290",
+ [("Chr5:5-20000-10776-G-T", "1.2", "CTATGACCCGTGAGCCCATG", "ab1.ba2", "132", "132", "287", "288", "290",
"4", "1", "4", "1", "0", "0", "4", "1", "0", "0", "1", "1", "0", "0", "0", "0",
"0", "0", "1", "6", "47170", "41149", "", ""),
("", "", "CTATGACCCGTGAGCCCATG", "ab2.ba1", "77", "132", "233", "200", "290",
"4", "1", "4", "1", "0", "0", "4", "1", "0", "0", "1", "1", "0", "0", "0", "0",
"0", "0", "1", "6", "47170", "41149", "", "")],
- [("chr5-11068-C-G", "2.1", "AAAAAAACATCATACACCCA", "ab1.ba2", "246", "244", "287", "288", "289",
+ [("Chr5:5-20000-11068-C-G", "2.1", "AAAAAAACATCATACACCCA", "ab1.ba2", "246", "244", "287", "288", "289",
"2", "8", "2", "8", "0", "0", "2", "8", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
"4081", "4098", "5", "10", "", ""),
("", "", "AAAAAAACATCATACACCCA", "ab2.ba1", None, None, None, None,
"289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", "0",
"0", "0", "0", "0", "4081", "4098", "5", "10", "", "")],
- [("chr5-11068-C-G", "2.2", "ATCAGCCATGGCTATTATTG", "ab1.ba2", "72", "72", "217", "288", "289",
+ [("Chr5:5-20000-11068-C-G", "2.2", "ATCAGCCATGGCTATTATTG", "ab1.ba2", "72", "72", "217", "288", "289",
"1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
"4081", "4098", "5", "10", "", ""),
("", "", "ATCAGCCATGGCTATTATTG", "ab2.ba1", "153", "164", "217", "260", "289",
"1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
"4081", "4098", "5", "10", "", "")],
- [("chr5-11068-C-G", "2.3", "ATCAATATGGCCTCGCCACG", "ab1.ba2", None, None, None, None,
+ [("Chr5:5-20000-11068-C-G", "2.3", "ATCAATATGGCCTCGCCACG", "ab1.ba2", None, None, None, None,
"289", "0", "5", "0", "5", "0", "0", "0", "5", None, None, None, "1", "0",
"0", "0", "0", "0", "0", "4081", "4098", "5", "10", "", ""),
("", "", "ATCAATATGGCCTCGCCACG", "ab2.ba1", "202", "255", "277", "290", "289",
"1", "3", "1", "3", "0", "0", "1", "3", "0", "0", "1", "1", "0", "0", "0", "0",
"0", "0", "4081", "4098", "5", "10", "", "")],
- [("chr5-11068-C-G", "2.4", "ATCAGCCATGGCTATTTTTT", "ab1.ba2", "72", "72", "217", "288", "289",
+ [("Chr5:5-20000-11068-C-G", "2.4", "ATCAGCCATGGCTATTTTTT", "ab1.ba2", "72", "72", "217", "288", "289",
"1", "1", "1", "1", "0", "0", "1", "1", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", "4081",
"4098", "5", "10", "", ""),
("", "", "ATCAGCCATGGCTATTTTTT", "ab2.ba1", "153", "164", "217", "260", "289",
"1", "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "0", "0", "0", "0", "4081",
"4098", "5", "10", "", "")],
- [("chr5-10776-G-T", "3.1", "ATGCCTACCTCATTTGTCGT", "ab1.ba2", "46", "15", "287", "288", "290",
+ [("Chr5:5-20000-10776-G-T", "3.1", "ATGCCTACCTCATTTGTCGT", "ab1.ba2", "46", "15", "287", "288", "290",
"3", "3", "3", "2", "3", "1", "0", "1", "1", "0.5", "0", "0.5", "0", "0", "0", "1",
"0", "0", "3", "3", "47170", "41149", "", ""),
("", "", "ATGCCTACCTCATTTGTCGT", "ab2.ba1", None, "274", None,
"288", "290", "0", "3", "0", "2", "0", "1", "0", "1", None, "0.5", None, "0.5",
"0", "0", "0", "1", "0", "0", "3", "3", "47170", "41149", "", "")],
- [("chr5-11315-C-T", "3.2", "ACAACATCACGTATTCAGGT", "ab1.ba2", "197", "197", "240", "255", "271",
+ [("Chr5:5-20000-11315-C-T", "3.2", "ACAACATCACGTATTCAGGT", "ab1.ba2", "197", "197", "240", "255", "271",
"2", "3", "2", "3", "0", "1", "2", "2", "0", "0.333333333333333", "1",
"0.666666666666667", "0", "0", "0", "0", "0", "0", "1", "1", "6584", "6482", "", ""),
("", "", "ACAACATCACGTATTCAGGT", "ab2.ba1", "35", "35", "240", "258", "271",
"2", "3", "2", "3", "0", "1", "2", "2", "0", "0.333333333333333", "1",
"0.666666666666667", "0", "0", "0", "0", "0", "0", "1", "1", "6584", "6482", "", "")],
- [("chr5-13983-G-C", "4.1", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "1", "100", "255", "276", "269",
- "5", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "5", "0", "1", "1", "5348", "5350", "", ""),
+ [("Chr5:5-20000-13983-G-C", "4.1", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "0", "100", "255", "276", "269",
+ "5", "6", "0", "6", "0", "0", "5", "6", "0", "0", "0", "1", "0", "0", "0", "0", "5", "0", "1", "1", "5348", "5350", "", ""),
("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None,
"269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
"0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
- [("chr5-13983-G-C", "4.2", "AAAAAAAGAATAACCCACAC", "ab1.ba2", "20", "270", "255", "276", "269",
- "5", "6", "5", "0", "0", "0", "5", "0", "0", "0", "1", "0", "0", "0", "0", "0", "0", "6", "1", "1", "5348", "5350", "", ""),
- ("", "", "AAAAAAAGAATAACCCACAC", "ab2.ba1", None, None, None, None,
- "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
- "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
- [("chr5-13963-T-C", "5", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
+ [("Chr5:5-20000-13963-T-C", "4.2", "TTTTTAAGAATAACCCACAC", "ab1.ba2", "38", "38", "240", "283", "263",
"110", "54", "110", "54", "0", "0", "110", "54", "0", "0", "1", "1", "0", "0", "0",
"0", "0", "0", "1", "1", "5348", "5350", "", ""),
("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263",
"7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0",
"0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
- [("chr5-13983-G-C", "6", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
+ [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)],
+ [("Chr5:5-20000-13983-G-C", "6", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
"0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0",
"0", "1", "1", "5348", "5350", "", ""),
("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None,
"269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
"0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]]
- start_row = 15
+ start_row = 20
ws3.write(start_row, 0, "Description of tiers with examples")
ws3.write_row(start_row + 1, 0, header_line)
row = 0
@@ -962,23 +1253,22 @@
ex = examples_tiers[i]
for k in range(len(ex)):
ws3.write_row(start_row + 2 + row + i + k + 2, 0, ex[k])
- ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format13, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
+ ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format13, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
{'type': 'formula', 'criteria': '=OR($B${}="2.1",$B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2),
'format': format33,
- 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
+ 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
{'type': 'formula',
'criteria': '=$B${}>="3"'.format(start_row + 2 + row + i + k + 2),
'format': format23,
- 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
+ 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)})
row += 3
workbook.close()
workbook2.close()
workbook3.close()
- csv_data.close()
-
if __name__ == '__main__':
sys.exit(read2mut(sys.argv))
+
diff -r da224c392a54 -r d21960b45a6b read2mut.xml
--- a/read2mut.xml Wed Feb 24 14:20:17 2021 +0000
+++ b/read2mut.xml Tue Mar 02 15:32:41 2021 +0000
@@ -1,12 +1,16 @@
-
+
Looks for reads with mutation at known positions and calculates frequencies and stats.
va_macros.xml
-
+
+ python
+ matplotlib
+ pysam
xlsxwriter
-
+ cyvcf2
+
@@ -33,13 +37,13 @@
-
-
+
+
+
-
@@ -51,10 +55,11 @@
-
-
+
+
+
+
-
@@ -70,7 +75,7 @@
**Input**
**Dataset 1:** VCF file with duplex consesus sequence (DCS) mutations. E.g.
-generated by the `FreeBayes `_ or `LoFreq `_ variant caller.
+generated by the `FreeBayes variant caller `_.
**Dataset 2:** BAM file of aligned raw reads. This file can be obtained by the
tool `Map with BWA-MEM `_.
@@ -86,7 +91,7 @@
**Output**
The output are three XLSX files containing frequencies stats for DCS mutations based
-on information from the raw reads and a CSV file containing the summary information without color-coding. In addition to that a tier based
+on information from the raw reads. In addition to that a tier based
classification is provided based on the amout of support for a true variant call.
]]>
diff -r da224c392a54 -r d21960b45a6b test-data/Variant_Analyzer_allele_frequencies_test.xlsx
Binary file test-data/Variant_Analyzer_allele_frequencies_test.xlsx has changed
diff -r da224c392a54 -r d21960b45a6b test-data/Variant_Analyzer_summary_test.xlsx
Binary file test-data/Variant_Analyzer_summary_test.xlsx has changed
diff -r da224c392a54 -r d21960b45a6b test-data/Variant_Analyzer_test.xlsx
Binary file test-data/Variant_Analyzer_test.xlsx has changed
diff -r da224c392a54 -r d21960b45a6b test-data/Variant_Analyzer_tiers_test.xlsx
Binary file test-data/Variant_Analyzer_tiers_test.xlsx has changed
diff -r da224c392a54 -r d21960b45a6b va_macros.xml
--- a/va_macros.xml Wed Feb 24 14:20:17 2021 +0000
+++ b/va_macros.xml Tue Mar 02 15:32:41 2021 +0000
@@ -1,21 +1,13 @@
-
-
- @misc{duplex,
- author = {Povysil, Gundula and Heinzl, Monika and Salazar, Renato and Stoler, Nicholas and Nekrutenko, Anton and Tiemann-Boege, Irene},
- year = {2020},
- title = {{Increased yields of duplex sequencing data by a series of quality control tools (manuscript)}}
- }
-
-
-
-
-
- matplotlib
- pysam
- cyvcf2
-
-
-
+
+
+ @misc{duplex,
+ author = {Povysil, Gundula and Heinzl, Monika and Salazar, Renato and Stoler, Nicholas and Nekrutenko, Anton and Tiemann-Boege, Irene},
+ year = {2019},
+ title = {{Variant Analyzer: a quality control for variant calling in duplex sequencing data (manuscript)}}
+ }
+
+
+
\ No newline at end of file