comparison read2mut.py @ 11:84a1a3f70407 draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author mheinzl
date Mon, 15 Feb 2021 21:53:24 +0000
parents e18c5293aac7
children 7a418148319d
comparison
equal deleted inserted replaced
10:e18c5293aac7 11:84a1a3f70407
8 Looks for reads with mutation at known 8 Looks for reads with mutation at known
9 positions and calculates frequencies and stats. 9 positions and calculates frequencies and stats.
10 10
11 ======= ========== ================= ================================ 11 ======= ========== ================= ================================
12 Version Date Author Description 12 Version Date Author Description
13 0.2.1 2019-10-27 Gundula Povysil - 13 2.0.0 2020-10-30 Gundula Povysil -
14 ======= ========== ================= ================================ 14 ======= ========== ================= ================================
15 15
16 16
17 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam 17 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam
18 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json 18 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json
21 """ 21 """
22 22
23 from __future__ import division 23 from __future__ import division
24 24
25 import argparse 25 import argparse
26 import itertools
27 import json 26 import json
28 import operator 27 import operator
29 import os 28 import os
30 import re 29 import re
31 import sys 30 import sys
45 parser.add_argument('--inputJson', 44 parser.add_argument('--inputJson',
46 help='JSON file with data collected by mut2read.py.') 45 help='JSON file with data collected by mut2read.py.')
47 parser.add_argument('--sscsJson', 46 parser.add_argument('--sscsJson',
48 help='JSON file with SSCS counts collected by mut2sscs.py.') 47 help='JSON file with SSCS counts collected by mut2sscs.py.')
49 parser.add_argument('--outputFile', 48 parser.add_argument('--outputFile',
50 help='Output xlsx file with summary of mutations.') 49 help='Output xlsx file of mutation details.')
51 parser.add_argument('--outputFile2',
52 help='Output xlsx file with allele frequencies of mutations.')
53 parser.add_argument('--outputFile3',
54 help='Output xlsx file with examples of the tier classification.')
55 parser.add_argument('--thresh', type=int, default=0, 50 parser.add_argument('--thresh', type=int, default=0,
56 help='Integer threshold for displaying mutations. Only mutations occuring less than thresh times are displayed. Default of 0 displays all.') 51 help='Integer threshold for displaying mutations. Only mutations occuring less than thresh times are displayed. Default of 0 displays all.')
57 parser.add_argument('--phred', type=int, default=20, 52 parser.add_argument('--phred', type=int, default=20,
58 help='Integer threshold for Phred score. Only reads higher than this threshold are considered. Default 20.') 53 help='Integer threshold for Phred score. Only reads higher than this threshold are considered. Default 20.')
59 parser.add_argument('--trim', type=int, default=10, 54 parser.add_argument('--trim', type=int, default=10,
60 help='Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10.') 55 help='Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10.')
61 parser.add_argument('--chimera_correction', action="store_true", 56 parser.add_argument('--chimera_correction', action="store_true",
62 help='Count chimeric variants and correct the variant frequencies') 57 help='Count chimeric variants and correct the variant frequencies')
63 parser.add_argument('--softclipping_dist', type=int, default=15,
64 help='Count mutation as an artifact if mutation lies within this parameter away from the softclipping part of the read.')
65 parser.add_argument('--reads_threshold', type=float, default=1.0,
66 help='Float number which specifies the minimum percentage of softclipped reads in a family to be considered in the softclipping tiers. Default: 1.0, means all reads of a family have to be softclipped.')
67 return parser 58 return parser
68 59
69 60
70 def safe_div(x, y): 61 def safe_div(x, y):
71 if y == 0: 62 if y == 0:
79 file1 = args.mutFile 70 file1 = args.mutFile
80 file2 = args.bamFile 71 file2 = args.bamFile
81 json_file = args.inputJson 72 json_file = args.inputJson
82 sscs_json = args.sscsJson 73 sscs_json = args.sscsJson
83 outfile = args.outputFile 74 outfile = args.outputFile
84 outfile2 = args.outputFile2
85 outfile3 = args.outputFile3
86 thresh = args.thresh 75 thresh = args.thresh
87 phred_score = args.phred 76 phred_score = args.phred
88 trim = args.trim 77 trim = args.trim
89 chimera_correction = args.chimera_correction 78 chimera_correction = args.chimera_correction
90 thr = args.softclipping_dist
91 threshold_reads = args.reads_threshold
92 79
93 if os.path.isfile(file1) is False: 80 if os.path.isfile(file1) is False:
94 sys.exit("Error: Could not find '{}'".format(file1)) 81 sys.exit("Error: Could not find '{}'".format(file1))
95 if os.path.isfile(file2) is False: 82 if os.path.isfile(file2) is False:
96 sys.exit("Error: Could not find '{}'".format(file2)) 83 sys.exit("Error: Could not find '{}'".format(file2))
100 sys.exit("Error: thresh is '{}', but only non-negative integers allowed".format(thresh)) 87 sys.exit("Error: thresh is '{}', but only non-negative integers allowed".format(thresh))
101 if phred_score < 0: 88 if phred_score < 0:
102 sys.exit("Error: phred is '{}', but only non-negative integers allowed".format(phred_score)) 89 sys.exit("Error: phred is '{}', but only non-negative integers allowed".format(phred_score))
103 if trim < 0: 90 if trim < 0:
104 sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thresh)) 91 sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thresh))
105 if thr <= 0:
106 sys.exit("Error: trim is '{}', but only non-negative integers allowed".format(thr))
107 92
108 # load dicts 93 # load dicts
109 with open(json_file, "r") as f: 94 with open(json_file, "r") as f:
110 (tag_dict, cvrg_dict) = json.load(f) 95 (tag_dict, cvrg_dict) = json.load(f)
111 96
112 with open(sscs_json, "r") as f: 97 with open(sscs_json, "r") as f:
113 (mut_pos_dict, ref_pos_dict) = json.load(f) 98 (mut_pos_dict, ref_pos_dict) = json.load(f)
114 99
115 # read bam file 100 # read bam file
116 # pysam.index(file2)
117 bam = pysam.AlignmentFile(file2, "rb") 101 bam = pysam.AlignmentFile(file2, "rb")
118 102
119 # create mut_dict 103 # create mut_dict
120 mut_dict = {} 104 mut_dict = {}
121 mut_read_pos_dict = {} 105 mut_read_pos_dict = {}
122 mut_read_dict = {} 106 mut_read_dict = {}
123 reads_dict = {} 107 reads_dict = {}
124 mut_read_cigar_dict = {}
125 i = 0 108 i = 0
126 mut_array = [] 109 mut_array = []
127 110
128 for count, variant in enumerate(VCF(file1)): 111 for variant in VCF(file1):
129 #if count == 2000:
130 # break
131 chrom = variant.CHROM 112 chrom = variant.CHROM
132 stop_pos = variant.start 113 stop_pos = variant.start
133 #chrom_stop_pos = str(chrom) + "#" + str(stop_pos) 114 #chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
134 ref = variant.REF 115 ref = variant.REF
135 if len(variant.ALT) == 0: 116 alt = variant.ALT[0]
136 continue
137 else:
138 alt = variant.ALT[0]
139 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt 117 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
140 118
141 if len(ref) == len(alt): 119 if len(ref) == len(alt):
142 mut_array.append([chrom, stop_pos, ref, alt]) 120 mut_array.append([chrom, stop_pos, ref, alt])
143 i += 1 121 i += 1
144 mut_dict[chrom_stop_pos] = {} 122 mut_dict[chrom_stop_pos] = {}
145 mut_read_pos_dict[chrom_stop_pos] = {} 123 mut_read_pos_dict[chrom_stop_pos] = {}
146 reads_dict[chrom_stop_pos] = {} 124 reads_dict[chrom_stop_pos] = {}
147 mut_read_cigar_dict[chrom_stop_pos] = {}
148 125
149 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): 126 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000):
150 if pileupcolumn.reference_pos == stop_pos: 127 if pileupcolumn.reference_pos == stop_pos:
151 count_alt = 0 128 count_alt = 0
152 count_ref = 0 129 count_ref = 0
153 count_indel = 0 130 count_indel = 0
154 count_n = 0 131 count_n = 0
155 count_other = 0 132 count_other = 0
156 count_lowq = 0 133 count_lowq = 0
157 n = 0 134 n = 0
158 #print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), 135 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups),
159 # "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) 136 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n)
160 for pileupread in pileupcolumn.pileups: 137 for pileupread in pileupcolumn.pileups:
161 n += 1 138 n += 1
162 if not pileupread.is_del and not pileupread.is_refskip: 139 if not pileupread.is_del and not pileupread.is_refskip:
163 tag = pileupread.alignment.query_name 140 tag = pileupread.alignment.query_name
164 nuc = pileupread.alignment.query_sequence[pileupread.query_position] 141 nuc = pileupread.alignment.query_sequence[pileupread.query_position]
170 if nuc in mut_dict[chrom_stop_pos][tag]: 147 if nuc in mut_dict[chrom_stop_pos][tag]:
171 mut_dict[chrom_stop_pos][tag][nuc] += 1 148 mut_dict[chrom_stop_pos][tag][nuc] += 1
172 else: 149 else:
173 mut_dict[chrom_stop_pos][tag][nuc] = 1 150 mut_dict[chrom_stop_pos][tag][nuc] = 1
174 if tag not in mut_read_pos_dict[chrom_stop_pos]: 151 if tag not in mut_read_pos_dict[chrom_stop_pos]:
175 mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] 152 mut_read_pos_dict[chrom_stop_pos][tag] = np.array(pileupread.query_position) + 1
176 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] 153 reads_dict[chrom_stop_pos][tag] = len(pileupread.alignment.query_sequence)
177 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring]
178 else: 154 else:
179 mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) 155 mut_read_pos_dict[chrom_stop_pos][tag] = np.append(
180 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) 156 mut_read_pos_dict[chrom_stop_pos][tag], pileupread.query_position + 1)
181 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) 157 reads_dict[chrom_stop_pos][tag] = np.append(
158 reads_dict[chrom_stop_pos][tag], len(pileupread.alignment.query_sequence))
159
182 if nuc == alt: 160 if nuc == alt:
183 count_alt += 1 161 count_alt += 1
184 if tag not in mut_read_dict: 162 if tag not in mut_read_dict:
185 mut_read_dict[tag] = {} 163 mut_read_dict[tag] = {}
186 mut_read_dict[tag][chrom_stop_pos] = (alt, ref) 164 mut_read_dict[tag][chrom_stop_pos] = (alt, ref)
195 else: 173 else:
196 count_other += 1 174 count_other += 1
197 else: 175 else:
198 count_indel += 1 176 count_indel += 1
199 177
200 #print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq)) 178 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s\n" % (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_indel, count_lowq))
201 #else: 179 else:
202 # print("indels are currently not evaluated") 180 print("indels are currently not evaluated")
181
203 mut_array = np.array(mut_array) 182 mut_array = np.array(mut_array)
204 for read in bam.fetch(until_eof=True): 183 for read in bam.fetch(until_eof=True):
205 if read.is_unmapped: 184 if read.is_unmapped:
206 pure_tag = read.query_name[:-5] 185 pure_tag = read.query_name[:-5]
207 nuc = "na" 186 nuc = "na"
217 bam.close() 196 bam.close()
218 197
219 # create pure_tags_dict 198 # create pure_tags_dict
220 pure_tags_dict = {} 199 pure_tags_dict = {}
221 for key1, value1 in sorted(mut_dict.items()): 200 for key1, value1 in sorted(mut_dict.items()):
222 #if len(np.where(np.array(['#'.join(str(i) for i in z)
223 # for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0]) == 0:
224 # continue
225
226 i = np.where(np.array(['#'.join(str(i) for i in z) 201 i = np.where(np.array(['#'.join(str(i) for i in z)
227 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] 202 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
228 ref = mut_array[i, 2] 203 ref = mut_array[i, 2]
229 alt = mut_array[i, 3] 204 alt = mut_array[i, 3]
230 pure_tags_dict[key1] = {} 205 pure_tags_dict[key1] = {}
244 if len(value) < thresh: 219 if len(value) < thresh:
245 pure_tags_dict_short[key] = value 220 pure_tags_dict_short[key] = value
246 else: 221 else:
247 pure_tags_dict_short = pure_tags_dict 222 pure_tags_dict_short = pure_tags_dict
248 223
249 # whole_array = []
250 # for k in pure_tags_dict.values():
251 # if len(k) != 0:
252 # keys = k.keys()
253 # if len(keys) > 1:
254 # for k1 in keys:
255 # whole_array.append(k1)
256 # else:
257 # whole_array.append(keys[0])
258
259 # output summary with threshold 224 # output summary with threshold
260 workbook = xlsxwriter.Workbook(outfile) 225 workbook = xlsxwriter.Workbook(outfile)
261 workbook2 = xlsxwriter.Workbook(outfile2) 226 workbook2 = xlsxwriter.Workbook(outfile2)
262 workbook3 = xlsxwriter.Workbook(outfile3) 227 workbook3 = xlsxwriter.Workbook(outfile3)
263 ws1 = workbook.add_worksheet("Results") 228 ws1 = workbook.add_worksheet("Results")
282 'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. alt.ba', 247 'rel. ref.ab', 'rel. ref.ba', 'rel. alt.ab', 'rel. alt.ba',
283 'na.ab', 'na.ba', 'lowq.ab', 'lowq.ba', 'trim.ab', 'trim.ba', 248 'na.ab', 'na.ba', 'lowq.ab', 'lowq.ba', 'trim.ab', 'trim.ba',
284 'SSCS alt.ab', 'SSCS alt.ba', 'SSCS ref.ab', 'SSCS ref.ba', 249 'SSCS alt.ab', 'SSCS alt.ba', 'SSCS ref.ab', 'SSCS ref.ba',
285 'in phase', 'chimeric tag') 250 'in phase', 'chimeric tag')
286 ws1.write_row(0, 0, header_line) 251 ws1.write_row(0, 0, header_line)
287
288 counter_tier11 = 0 252 counter_tier11 = 0
289 counter_tier12 = 0 253 counter_tier12 = 0
290 counter_tier21 = 0 254 counter_tier21 = 0
291 counter_tier22 = 0 255 counter_tier22 = 0
292 counter_tier23 = 0 256 counter_tier23 = 0
293 counter_tier24 = 0 257 counter_tier24 = 0
294 counter_tier31 = 0 258 counter_tier31 = 0
295 counter_tier32 = 0 259 counter_tier32 = 0
296 counter_tier41 = 0 260 counter_tier41 = 0
297 counter_tier42 = 0 261 counter_tier42 = 0
298 # if chimera_correction: 262 counter_tier5 = 0
299 # counter_tier43 = 0
300 counter_tier51 = 0
301 counter_tier52 = 0
302 counter_tier53 = 0
303 counter_tier54 = 0
304 counter_tier55 = 0
305 counter_tier6 = 0
306
307 row = 1 263 row = 1
308 tier_dict = {} 264 tier_dict = {}
309 chimera_dict = {} 265 chimera_dict = {}
310 for key1, value1 in sorted(mut_dict.items()): 266 for key1, value1 in sorted(mut_dict.items()):
311 counts_mut = 0 267 counts_mut = 0
312 chimeric_tag_list = []
313 chimeric_tag = {} 268 chimeric_tag = {}
314 if key1 in pure_tags_dict_short.keys(): 269 if key1 in pure_tags_dict_short.keys():
315 i = np.where(np.array(['#'.join(str(i) for i in z) 270 i = np.where(np.array(['#'.join(str(i) for i in z)
316 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] 271 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
317 ref = mut_array[i, 2] 272 ref = mut_array[i, 2]
318 alt = mut_array[i, 3] 273 alt = mut_array[i, 3]
319 dcs_median = cvrg_dict[key1][2] 274 dcs_median = cvrg_dict[key1][2]
320 whole_array = pure_tags_dict_short[key1].keys() 275 whole_array = list(pure_tags_dict_short[key1].keys())
321 276
322 tier_dict[key1] = {} 277 tier_dict[key1] = {}
323 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), 278 values_tier_dict = [("tier 1.1", 0), ("tier 1.2", 0), ("tier 2.1", 0), ("tier 2.2", 0), ("tier 2.3", 0), ("tier 2.4", 0), ("tier 3.1", 0), ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5", 0)]
324 ("tier 3.2", 0), ("tier 4.1", 0), ("tier 4.2", 0), ("tier 5.1", 0), ("tier 5.2", 0), ("tier 5.3", 0), ("tier 5.4", 0), ("tier 5.5", 0),
325 ("tier 6", 0)]
326 for k, v in values_tier_dict: 279 for k, v in values_tier_dict:
327 tier_dict[key1][k] = v 280 tier_dict[key1][k] = v
328 281
329 used_keys = [] 282 used_keys = []
330 if 'ab' in mut_pos_dict[key1].keys(): 283 if 'ab' in mut_pos_dict[key1].keys():
349 if (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key2[:-5] not in used_keys) and (key1 in tag_dict[key2[:-5]].keys()): 302 if (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key2[:-5] not in used_keys) and (key1 in tag_dict[key2[:-5]].keys()):
350 if key2[:-5] + '.ab.1' in mut_dict[key1].keys(): 303 if key2[:-5] + '.ab.1' in mut_dict[key1].keys():
351 total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values()) 304 total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values())
352 if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): 305 if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
353 na1 = mut_dict[key1][key2[:-5] + '.ab.1']['na'] 306 na1 = mut_dict[key1][key2[:-5] + '.ab.1']['na']
354 # na1f = na1/total1 307 else:
355 else:
356 # na1 = na1f = 0
357 na1 = 0 308 na1 = 0
358 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): 309 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
359 lowq1 = mut_dict[key1][key2[:-5] + '.ab.1']['lowQ'] 310 lowq1 = mut_dict[key1][key2[:-5] + '.ab.1']['lowQ']
360 # lowq1f = lowq1 / total1 311 else:
361 else:
362 # lowq1 = lowq1f = 0
363 lowq1 = 0 312 lowq1 = 0
364 if ref in mut_dict[key1][key2[:-5] + '.ab.1'].keys(): 313 if ref in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
365 ref1 = mut_dict[key1][key2[:-5] + '.ab.1'][ref] 314 ref1 = mut_dict[key1][key2[:-5] + '.ab.1'][ref]
366 ref1f = ref1 / (total1 - na1 - lowq1) 315 ref1f = ref1 / (total1 - na1 - lowq1)
367 else: 316 else:
392 341
393 if key2[:-5] + '.ab.2' in mut_dict[key1].keys(): 342 if key2[:-5] + '.ab.2' in mut_dict[key1].keys():
394 total2 = sum(mut_dict[key1][key2[:-5] + '.ab.2'].values()) 343 total2 = sum(mut_dict[key1][key2[:-5] + '.ab.2'].values())
395 if 'na' in mut_dict[key1][key2[:-5] + '.ab.2'].keys(): 344 if 'na' in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
396 na2 = mut_dict[key1][key2[:-5] + '.ab.2']['na'] 345 na2 = mut_dict[key1][key2[:-5] + '.ab.2']['na']
397 # na2f = na2 / total2 346 else:
398 else:
399 # na2 = na2f = 0
400 na2 = 0 347 na2 = 0
401 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.2'].keys(): 348 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
402 lowq2 = mut_dict[key1][key2[:-5] + '.ab.2']['lowQ'] 349 lowq2 = mut_dict[key1][key2[:-5] + '.ab.2']['lowQ']
403 # lowq2f = lowq2 / total2 350 else:
404 else:
405 # lowq2 = lowq2f = 0
406 lowq2 = 0 351 lowq2 = 0
407 if ref in mut_dict[key1][key2[:-5] + '.ab.2'].keys(): 352 if ref in mut_dict[key1][key2[:-5] + '.ab.2'].keys():
408 ref2 = mut_dict[key1][key2[:-5] + '.ab.2'][ref] 353 ref2 = mut_dict[key1][key2[:-5] + '.ab.2'][ref]
409 ref2f = ref2 / (total2 - na2 - lowq2) 354 ref2f = ref2 / (total2 - na2 - lowq2)
410 else: 355 else:
435 380
436 if key2[:-5] + '.ba.1' in mut_dict[key1].keys(): 381 if key2[:-5] + '.ba.1' in mut_dict[key1].keys():
437 total3 = sum(mut_dict[key1][key2[:-5] + '.ba.1'].values()) 382 total3 = sum(mut_dict[key1][key2[:-5] + '.ba.1'].values())
438 if 'na' in mut_dict[key1][key2[:-5] + '.ba.1'].keys(): 383 if 'na' in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
439 na3 = mut_dict[key1][key2[:-5] + '.ba.1']['na'] 384 na3 = mut_dict[key1][key2[:-5] + '.ba.1']['na']
440 # na3f = na3 / total3 385 else:
441 else:
442 # na3 = na3f = 0
443 na3 = 0 386 na3 = 0
444 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.1'].keys(): 387 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
445 lowq3 = mut_dict[key1][key2[:-5] + '.ba.1']['lowQ'] 388 lowq3 = mut_dict[key1][key2[:-5] + '.ba.1']['lowQ']
446 # lowq3f = lowq3 / total3 389 else:
447 else:
448 # lowq3 = lowq3f = 0
449 lowq3 = 0 390 lowq3 = 0
450 if ref in mut_dict[key1][key2[:-5] + '.ba.1'].keys(): 391 if ref in mut_dict[key1][key2[:-5] + '.ba.1'].keys():
451 ref3 = mut_dict[key1][key2[:-5] + '.ba.1'][ref] 392 ref3 = mut_dict[key1][key2[:-5] + '.ba.1'][ref]
452 ref3f = ref3 / (total3 - na3 - lowq3) 393 ref3f = ref3 / (total3 - na3 - lowq3)
453 else: 394 else:
474 415
475 if key2[:-5] + '.ba.2' in mut_dict[key1].keys(): 416 if key2[:-5] + '.ba.2' in mut_dict[key1].keys():
476 total4 = sum(mut_dict[key1][key2[:-5] + '.ba.2'].values()) 417 total4 = sum(mut_dict[key1][key2[:-5] + '.ba.2'].values())
477 if 'na' in mut_dict[key1][key2[:-5] + '.ba.2'].keys(): 418 if 'na' in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
478 na4 = mut_dict[key1][key2[:-5] + '.ba.2']['na'] 419 na4 = mut_dict[key1][key2[:-5] + '.ba.2']['na']
479 # na4f = na4 / total4 420 else:
480 else:
481 # na4 = na4f = 0
482 na4 = 0 421 na4 = 0
483 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.2'].keys(): 422 if 'lowQ' in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
484 lowq4 = mut_dict[key1][key2[:-5] + '.ba.2']['lowQ'] 423 lowq4 = mut_dict[key1][key2[:-5] + '.ba.2']['lowQ']
485 # lowq4f = lowq4 / total4 424 else:
486 else:
487 # lowq4 = lowq4f = 0
488 lowq4 = 0 425 lowq4 = 0
489 if ref in mut_dict[key1][key2[:-5] + '.ba.2'].keys(): 426 if ref in mut_dict[key1][key2[:-5] + '.ba.2'].keys():
490 ref4 = mut_dict[key1][key2[:-5] + '.ba.2'][ref] 427 ref4 = mut_dict[key1][key2[:-5] + '.ba.2'][ref]
491 ref4f = ref4 / (total4 - na4 - lowq4) 428 ref4f = ref4 / (total4 - na4 - lowq4)
492 else: 429 else:
511 total4 = total4new = na4 = lowq4 = 0 448 total4 = total4new = na4 = lowq4 = 0
512 ref4 = alt4 = ref4f = alt4f = 0 449 ref4 = alt4 = ref4f = alt4f = 0
513 450
514 read_pos1 = read_pos2 = read_pos3 = read_pos4 = -1 451 read_pos1 = read_pos2 = read_pos3 = read_pos4 = -1
515 read_len_median1 = read_len_median2 = read_len_median3 = read_len_median4 = 0 452 read_len_median1 = read_len_median2 = read_len_median3 = read_len_median4 = 0
516 cigars_dcs1 = cigars_dcs2 = cigars_dcs3 = cigars_dcs4 = [] 453
517 pos_read1 = pos_read2 = pos_read3 = pos_read4 = []
518 end_read1 = end_read2 = end_read3 = end_read4 = []
519 if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys(): 454 if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys():
520 read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1'])) 455 read_pos1 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.1'])
521 read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1'])) 456 read_len_median1 = np.median(reads_dict[key1][key2[:-5] + '.ab.1'])
522 cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1']
523 #print(mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'])
524 pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1']
525 #print(cigars_dcs1)
526 end_read1 = reads_dict[key1][key2[:-5] + '.ab.1']
527 if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys(): 457 if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys():
528 read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2'])) 458 read_pos2 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ab.2'])
529 read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2'])) 459 read_len_median2 = np.median(reads_dict[key1][key2[:-5] + '.ab.2'])
530 cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2']
531 pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2']
532 end_read2 = reads_dict[key1][key2[:-5] + '.ab.2']
533 if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys(): 460 if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys():
534 read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1'])) 461 read_pos3 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.1'])
535 read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1'])) 462 read_len_median3 = np.median(reads_dict[key1][key2[:-5] + '.ba.1'])
536 cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1']
537 pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1']
538 end_read3 = reads_dict[key1][key2[:-5] + '.ba.1']
539 if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys(): 463 if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys():
540 read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2'])) 464 read_pos4 = np.median(mut_read_pos_dict[key1][key2[:-5] + '.ba.2'])
541 read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2'])) 465 read_len_median4 = np.median(reads_dict[key1][key2[:-5] + '.ba.2'])
542 #print(mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'])
543 cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2']
544
545 pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2']
546 #print(cigars_dcs4)
547 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2']
548 466
549 used_keys.append(key2[:-5]) 467 used_keys.append(key2[:-5])
550 counts_mut += 1 468 counts_mut += 1
551 if (alt1f + alt2f + alt3f + alt4f) > 0.5: 469 if (alt1f + alt2f + alt3f + alt4f) > 0.5:
552 if total1new == 0: 470 if total1new == 0:
575 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) 493 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
576 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) 494 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
577 495
578 trimmed = False 496 trimmed = False
579 contradictory = False 497 contradictory = False
580 softclipped_mutation_allMates = False 498
581 softclipped_mutation_oneOfTwoMates = False 499 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
582 softclipped_mutation_oneOfTwoSSCS = False
583 softclipped_mutation_oneMate = False
584 softclipped_mutation_oneMateOneSSCS = False
585 print()
586 print(key1, cigars_dcs1, cigars_dcs4, cigars_dcs2, cigars_dcs3)
587 dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = []
588 dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = []
589 ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False
590 ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False
591
592 # mate 1 - SSCS ab
593 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1]
594 ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads
595
596 if any(ij is True for ij in softclipped_idx1):
597 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1]
598 softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1]
599 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1]
600 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)]
601 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)]
602
603 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
604 if any(ij is True for ij in softclipped_both_ends_idx1):
605 print(softclipped_both_ends_idx1)
606 for nr, indx in enumerate(softclipped_both_ends_idx1):
607 if indx:
608 if dist_start_read1[nr] <= dist_end_read1[nr]:
609 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number
610 else:
611 dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number
612 ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads
613 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads
614 print(key1, "mate1 ab", dist_start_read1, dist_end_read1, cigars_dcs1, ratio1, ratio_dist_start1, ratio_dist_end1)
615
616 # mate 1 - SSCS ba
617 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4]
618 ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads
619 if any(ij is True for ij in softclipped_idx4):
620 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4]
621 softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4]
622 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4]
623 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)]
624 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)]
625
626 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
627 if any(ij is True for ij in softclipped_both_ends_idx4):
628 print(softclipped_both_ends_idx4)
629 for nr, indx in enumerate(softclipped_both_ends_idx4):
630 if indx:
631 if dist_start_read4[nr] <= dist_end_read4[nr]:
632 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number
633 else:
634 dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number
635 ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads
636 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads
637 print(key1, "mate1 ba", dist_start_read4, dist_end_read4,cigars_dcs4, ratio4, ratio_dist_start4, ratio_dist_end4)
638
639 # mate 2 - SSCS ab
640 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2]
641 #print(sum(softclipped_idx2))
642 ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads
643 if any(ij is True for ij in softclipped_idx2):
644 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2]
645 softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2]
646 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2]
647 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)]
648 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)]
649
650 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
651 if any(ij is True for ij in softclipped_both_ends_idx2):
652 print(softclipped_both_ends_idx2)
653 for nr, indx in enumerate(softclipped_both_ends_idx2):
654 if indx:
655 if dist_start_read2[nr] <= dist_end_read2[nr]:
656 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number
657 else:
658 dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number
659 ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads
660 #print(ratio_dist_end2)
661 #print([True if x <= thr else False for x in ratio_dist_end2])
662 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads
663 print(key1, "mate2 ab", dist_start_read2, dist_end_read2,cigars_dcs2, ratio2, ratio_dist_start2, ratio_dist_end2)
664
665 # mate 2 - SSCS ba
666 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3]
667 ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads
668 if any(ij is True for ij in softclipped_idx3):
669 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3]
670 softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3]
671 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3]
672 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)]
673 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)]
674
675 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping
676 if any(ij is True for ij in softclipped_both_ends_idx3):
677 print(softclipped_both_ends_idx3)
678 for nr, indx in enumerate(softclipped_both_ends_idx3):
679 if indx:
680 if dist_start_read3[nr] <= dist_end_read3[nr]:
681 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh
682 else:
683 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number
684 #print([True if x <= thr else False for x in dist_start_read3])
685 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads
686 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads
687 print(key1, "mate2 ba", dist_start_read3, dist_end_read3,cigars_dcs3, ratio3, ratio_dist_start3, ratio_dist_end3)
688
689 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant
690 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) |
691 (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) &
692 all(float(ij) == 0. for ij in [alt1ff, alt4ff]))):
693 alt1ff = 0 500 alt1ff = 0
694 alt4ff = 0 501 alt4ff = 0
695 alt2ff = 0 502 alt2ff = 0
696 alt3ff = 0 503 alt3ff = 0
697 trimmed = False 504 trimmed = False
698 contradictory = True 505 contradictory = True
699 # softclipping tiers
700 # information of both mates available --> all reads for both mates and SSCS are softclipped
701 elif (ratio1 & ratio4 & ratio2 & ratio3 &
702 (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
703 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
704 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
705 softclipped_mutation_allMates = True
706 softclipped_mutation_oneOfTwoMates = False
707 softclipped_mutation_oneOfTwoSSCS = False
708 softclipped_mutation_oneMate = False
709 softclipped_mutation_oneMateOneSSCS = False
710 alt1ff = 0
711 alt4ff = 0
712 alt2ff = 0
713 alt3ff = 0
714 trimmed = False
715 contradictory = False
716 print(key1, "softclipped_mutation_allMates", softclipped_mutation_allMates)
717 # information of both mates available --> only one mate softclipped
718 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) |
719 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) &
720 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
721 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
722 softclipped_mutation_allMates = False
723 softclipped_mutation_oneOfTwoMates = True
724 softclipped_mutation_oneOfTwoSSCS = False
725 softclipped_mutation_oneMate = False
726 softclipped_mutation_oneMateOneSSCS = False
727 alt1ff = 0
728 alt4ff = 0
729 alt2ff = 0
730 alt3ff = 0
731 trimmed = False
732 contradictory = False
733 print(key1, "softclipped_mutation_oneOfTwoMates", softclipped_mutation_oneOfTwoMates)
734 # information of both mates available --> only one mate softclipped
735 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
736 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
737 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available
738 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
739 softclipped_mutation_allMates = False
740 softclipped_mutation_oneOfTwoMates = False
741 softclipped_mutation_oneOfTwoSSCS = True
742 softclipped_mutation_oneMate = False
743 softclipped_mutation_oneMateOneSSCS = False
744 alt1ff = 0
745 alt4ff = 0
746 alt2ff = 0
747 alt3ff = 0
748 trimmed = False
749 contradictory = False
750 print(key1, "softclipped_mutation_oneOfTwoSSCS", softclipped_mutation_oneOfTwoSSCS, [alt1ff, alt2ff, alt3ff, alt4ff])
751 # information of one mate available --> all reads of one mate are softclipped
752 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) &
753 all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) |
754 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) &
755 all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available
756 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
757 #if ((((len(dist_start_read1) > 0 | len(dist_end_read1) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1))) &
758 # ((len(dist_start_read4) > 0 | len(dist_end_read4) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4)))) |
759 # (((len(dist_start_read2) > 0 | len(dist_end_read2) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2))) &
760 # ((len(dist_start_read3) > 0 | len(dist_end_read3) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3))))):
761 softclipped_mutation_allMates = False
762 softclipped_mutation_oneOfTwoMates = False
763 softclipped_mutation_oneOfTwoSSCS = False
764 softclipped_mutation_oneMate = True
765 softclipped_mutation_oneMateOneSSCS = False
766 alt1ff = 0
767 alt4ff = 0
768 alt2ff = 0
769 alt3ff = 0
770 trimmed = False
771 contradictory = False
772 print(key1, "softclipped_mutation_oneMate", softclipped_mutation_oneMate)
773 # information of one mate available --> only one SSCS is softclipped
774 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) &
775 (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) |
776 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) &
777 (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available
778 # if distance between softclipping and mutation is at start or end of the read smaller than threshold
779 #if ((all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1)) |
780 # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4))) |
781 # (all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2)) |
782 # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3)))):
783 softclipped_mutation_allMates = False
784 softclipped_mutation_oneOfTwoMates = False
785 softclipped_mutation_oneOfTwoSSCS = False
786 softclipped_mutation_oneMate = False
787 softclipped_mutation_oneMateOneSSCS = True
788 alt1ff = 0
789 alt4ff = 0
790 alt2ff = 0
791 alt3ff = 0
792 trimmed = False
793 contradictory = False
794 print(key1, "softclipped_mutation_oneMateOneSSCS", softclipped_mutation_oneMateOneSSCS)
795
796 else: 506 else:
797 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): 507 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))):
798 beg1 = total1new 508 beg1 = total1new
799 total1new = 0 509 total1new = 0
800 alt1ff = 0 510 alt1ff = 0
823 trimmed = True 533 trimmed = True
824 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) 534 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4)
825 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) 535 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3)
826 536
827 # assign tiers 537 # assign tiers
828 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & 538 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | (all(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
829 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
830 (all(int(ij) >= 3 for ij in [total2new, total3new]) &
831 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
832 tier = "1.1" 539 tier = "1.1"
833 counter_tier11 += 1 540 counter_tier11 += 1
834 tier_dict[key1]["tier 1.1"] += 1 541 tier_dict[key1]["tier 1.1"] += 1
835 542
836 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & 543 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new])
837 any(int(ij) >= 3 for ij in [total1new, total4new]) & 544 & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
838 any(int(ij) >= 3 for ij in [total2new, total3new]) &
839 all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
840 tier = "1.2" 545 tier = "1.2"
841 counter_tier12 += 1 546 counter_tier12 += 1
842 tier_dict[key1]["tier 1.2"] += 1 547 tier_dict[key1]["tier 1.2"] += 1
843 548
844 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & 549 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))
845 any(int(ij) >= 3 for ij in [total1new, total4new]) & 550 | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
846 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) |
847 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
848 any(int(ij) >= 3 for ij in [total2new, total3new]) &
849 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
850 tier = "2.1" 551 tier = "2.1"
851 counter_tier21 += 1 552 counter_tier21 += 1
852 tier_dict[key1]["tier 2.1"] += 1 553 tier_dict[key1]["tier 2.1"] += 1
853 554
854 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & 555 elif (all(int(ij) >= 1 for ij in [total1new, total2new, total3new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
855 all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])):
856 tier = "2.2" 556 tier = "2.2"
857 counter_tier22 += 1 557 counter_tier22 += 1
858 tier_dict[key1]["tier 2.2"] += 1 558 tier_dict[key1]["tier 2.2"] += 1
859 559
860 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & 560 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & any(int(ij) >= 3 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) & any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))
861 any(int(ij) >= 3 for ij in [total2new, total3new]) & 561 | (all(int(ij) >= 1 for ij in [total2new, total3new]) & any(int(ij) >= 3 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) & any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))):
862 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]) &
863 any(float(ij) >= 0.75 for ij in [alt2ff, alt3ff])) |
864 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
865 any(int(ij) >= 3 for ij in [total1new, total4new]) &
866 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]) &
867 any(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))):
868 tier = "2.3" 562 tier = "2.3"
869 counter_tier23 += 1 563 counter_tier23 += 1
870 tier_dict[key1]["tier 2.3"] += 1 564 tier_dict[key1]["tier 2.3"] += 1
871 565
872 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & 566 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff]))
873 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | 567 | (all(int(ij) >= 1 for ij in [total2new, total3new]) & all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
874 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
875 all(float(ij) >= 0.75 for ij in [alt2ff, alt3ff]))):
876 tier = "2.4" 568 tier = "2.4"
877 counter_tier24 += 1 569 counter_tier24 += 1
878 tier_dict[key1]["tier 2.4"] += 1 570 tier_dict[key1]["tier 2.4"] += 1
879 571
880 elif ((len(pure_tags_dict_short[key1]) > 1) & 572 elif ((len(pure_tags_dict_short[key1]) > 1) & (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) | all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
881 (all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) |
882 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
883 tier = "3.1" 573 tier = "3.1"
884 counter_tier31 += 1 574 counter_tier31 += 1
885 tier_dict[key1]["tier 3.1"] += 1 575 tier_dict[key1]["tier 3.1"] += 1
886 576
887 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & 577 elif ((all(int(ij) >= 1 for ij in [total1new, total4new]) & all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]))
888 all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff])) | 578 | (all(int(ij) >= 1 for ij in [total2new, total3new]) & all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
889 (all(int(ij) >= 1 for ij in [total2new, total3new]) &
890 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))):
891 tier = "3.2" 579 tier = "3.2"
892 counter_tier32 += 1 580 counter_tier32 += 1
893 tier_dict[key1]["tier 3.2"] += 1 581 tier_dict[key1]["tier 3.2"] += 1
894 582
895 elif (trimmed): 583 elif (trimmed):
900 elif (contradictory): 588 elif (contradictory):
901 tier = "4.2" 589 tier = "4.2"
902 counter_tier42 += 1 590 counter_tier42 += 1
903 tier_dict[key1]["tier 4.2"] += 1 591 tier_dict[key1]["tier 4.2"] += 1
904 592
905 elif softclipped_mutation_allMates: 593 else:
906 tier = "5.1" 594 tier = "5"
907 counter_tier51 += 1 595 counter_tier5 += 1
908 tier_dict[key1]["tier 5.1"] += 1 596 tier_dict[key1]["tier 5"] += 1
909
910 elif softclipped_mutation_oneOfTwoMates:
911 tier = "5.2"
912 counter_tier52 += 1
913 tier_dict[key1]["tier 5.2"] += 1
914
915 elif softclipped_mutation_oneOfTwoSSCS:
916 tier = "5.3"
917 counter_tier53 += 1
918 tier_dict[key1]["tier 5.3"] += 1
919
920 elif softclipped_mutation_oneMate:
921 tier = "5.4"
922 counter_tier54 += 1
923 tier_dict[key1]["tier 5.4"] += 1
924
925 elif softclipped_mutation_oneMateOneSSCS:
926 tier = "5.5"
927 counter_tier55 += 1
928 tier_dict[key1]["tier 5.5"] += 1
929
930 else:
931 tier = "6"
932 counter_tier6 += 1
933 tier_dict[key1]["tier 6"] += 1
934 597
935 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) 598 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
936 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) 599 var_id = '-'.join([chrom, str(int(pos) + 1), ref, alt])
937 sample_tag = key2[:-5] 600 sample_tag = key2[:-5]
938 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time 601 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time
939 # exclude identical tag from array2, to prevent comparison to itself 602 # exclude identical tag from array2, to prevent comparison to itself
940 same_tag = np.where(array2 == sample_tag) 603 same_tag = np.where(array2 == sample_tag)
941 index_array2 = np.arange(0, len(array2), 1) 604 index_array2 = np.arange(0, len(array2), 1)
960 half1_mate1 = array1_half2 623 half1_mate1 = array1_half2
961 half2_mate1 = array1_half 624 half2_mate1 = array1_half
962 half1_mate2 = array2_half2 625 half1_mate2 = array2_half2
963 half2_mate2 = array2_half 626 half2_mate2 = array2_half
964 # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" 627 # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
965 dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2]) 628 dist = np.array([sum(map(operator.ne, half1_mate1, c)) for c in half1_mate2])
966 min_index = np.where(dist == dist.min()) # get index of min HD 629 min_index = np.where(dist == dist.min()) # get index of min HD
967 # get all "b's" of the tag or all "a's" of the tag with minimum HD 630 # get all "b's" of the tag or all "a's" of the tag with minimum HD
968 min_tag_half2 = half2_mate2[min_index] 631 min_tag_half2 = half2_mate2[min_index]
969 min_tag_array2 = array2[min_index] # get whole tag with min HD 632 min_tag_array2 = array2[min_index] # get whole tag with min HD
970 min_value = dist.min() 633 min_value = dist.min()
971 # calculate HD of "b" to all "b's" or "a" to all "a's" 634 # calculate HD of "b" to all "b's" or "a" to all "a's"
972 dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e)) 635 dist_second_half = np.array([sum(map(operator.ne, half2_mate1, e))
973 for e in min_tag_half2]) 636 for e in min_tag_half2])
974 637
975 dist2 = dist_second_half.max() 638 dist2 = dist_second_half.max()
976 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD 639 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD
977 max_tag = min_tag_array2[max_index] 640 max_tag = min_tag_array2[max_index]
978 641
979 # tags which have identical parts: 642 # tags which have identical parts:
980 if min_value == 0 or dist2 == 0: 643 if min_value == 0 or dist2 == 0:
981 min_tags_list_zeros.append(tag) 644 min_tags_list_zeros.append(tag)
982 chimera_tags.append(max_tag) 645 chimera_tags.append(max_tag)
983 # chimeric = True 646
984 # else:
985 # chimeric = False
986
987 # if mate_b is False:
988 # text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric)
989 # else:
990 # text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric)
991 i += 1 647 i += 1
992 chimera_tags = [x for x in chimera_tags if x != []] 648 chimera_tags = [x for x in chimera_tags if x != []]
993 chimera_tags_new = [] 649 chimera_tags_new = []
994 for i in chimera_tags: 650 for i in chimera_tags:
995 if len(i) > 1: 651 if len(i) > 1:
1025 681
1026 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), 682 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
1027 {'type': 'formula', 683 {'type': 'formula',
1028 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 684 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1),
1029 'format': format1, 685 'format': format1,
1030 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) 686 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)})
1031 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), 687 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
1032 {'type': 'formula', 688 {'type': 'formula',
1033 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(row + 1, row + 1, row + 1, row + 1), 689 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(row + 1, row + 1, row + 1, row + 1),
1034 'format': format3, 690 'format': format3,
1035 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) 691 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)})
1036 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), 692 ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
1037 {'type': 'formula', 693 {'type': 'formula',
1038 'criteria': '=$B${}>="3"'.format(row + 1), 694 'criteria': '=$B${}>="3"'.format(row + 1),
1039 'format': format2, 695 'format': format2,
1040 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) 696 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1)})
1041
1042 row += 3 697 row += 3
698
1043 if chimera_correction: 699 if chimera_correction:
1044 chimeric_dcs_high_tiers = 0 700 chimeric_dcs_high_tiers = 0
1045 chimeric_dcs = 0 701 chimeric_dcs = 0
1046 for keys_chimera in chimeric_tag.keys(): 702 for keys_chimera in chimeric_tag.keys():
1047 tiers = chimeric_tag[keys_chimera] 703 tiers = chimeric_tag[keys_chimera]
1050 if high_tiers == len(tiers): 706 if high_tiers == len(tiers):
1051 chimeric_dcs_high_tiers += high_tiers - 1 707 chimeric_dcs_high_tiers += high_tiers - 1
1052 else: 708 else:
1053 chimeric_dcs_high_tiers += high_tiers 709 chimeric_dcs_high_tiers += high_tiers
1054 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers) 710 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers)
711
1055 # sheet 2 712 # sheet 2
1056 if chimera_correction: 713 if chimera_correction:
1057 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'chimeras in AC alt (all tiers)', 'chimera-corrected cvrg', 'chimera-corrected AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'chimeras in AC alt (tiers 1.1-2.4)', 'chimera-corrected cvrg (tiers 1.1-2.4)', 'chimera-corrected AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)', 714 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'chimeras in AC alt (all tiers)', 'chimera-corrected cvrg', 'chimera-corrected AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'chimeras in AC alt (tiers 1.1-2.4)', 'chimera-corrected cvrg (tiers 1.1-2.4)', 'chimera-corrected AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
1058 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 715 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
1059 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', 716 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
1060 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6') 717 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5')
1061 else: 718 else:
1062 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)', 719 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.4)', 'AC alt (tiers 1.1-2.4)', 'AF (tiers 1.1-2.4)', 'AC alt (orginal DCS)', 'AF (original DCS)',
1063 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4', 720 'tier 1.1', 'tier 1.2', 'tier 2.1', 'tier 2.2', 'tier 2.3', 'tier 2.4',
1064 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5.1', 'tier 5.2', 'tier 5.3', 'tier 5.4', 'tier 5.5', 'tier 6', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2', 721 'tier 3.1', 'tier 3.2', 'tier 4.1', 'tier 4.2', 'tier 5', 'AF 1.1-1.2', 'AF 1.1-2.1', 'AF 1.1-2.2',
1065 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5.1', 'AF 1.1-5.2', 'AF 1.1-5.3', 'AF 1.1-5.4', 'AF 1.1-5.5', 'AF 1.1-6') 722 'AF 1.1-2.3', 'AF 1.1-2.4', 'AF 1.1-3.1', 'AF 1.1-3.2', 'AF 1.1-4.1', 'AF 1.1-4.2', 'AF 1.1-5')
1066 723
1067 ws2.write_row(0, 0, header_line2) 724 ws2.write_row(0, 0, header_line2)
1068 row = 0 725 row = 0
1069 726
1070 for key1, value1 in sorted(tier_dict.items()): 727 for key1, value1 in sorted(tier_dict.items()):
1076 chrom, pos, ref_a, alt_a = re.split(r'\#', key1) 733 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
1077 ref_count = cvrg_dict[key1][0] 734 ref_count = cvrg_dict[key1][0]
1078 alt_count = cvrg_dict[key1][1] 735 alt_count = cvrg_dict[key1][1]
1079 cvrg = ref_count + alt_count 736 cvrg = ref_count + alt_count
1080 737
1081 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) 738 var_id = '-'.join([chrom, str(int(pos) + 1), ref, alt])
1082 lst = [var_id, cvrg] 739 lst = [var_id, cvrg]
1083 used_tiers = [] 740 used_tiers = []
1084 cum_af = [] 741 cum_af = []
1085 for key2, value2 in sorted(value1.items()): 742 for key2, value2 in sorted(value1.items()):
1086 # calculate cummulative AF 743 # calculate cummulative AF
1087 used_tiers.append(value2) 744 used_tiers.append(value2)
1088 if len(used_tiers) > 1: 745 if len(used_tiers) > 1:
1089 cum = safe_div(sum(used_tiers), cvrg) 746 cum = safe_div(sum(used_tiers), cvrg)
1090 cum_af.append(cum) 747 cum_af.append(cum)
1091 if sum(used_tiers) == 0: # skip mutations that are filtered by the VA in the first place
1092 continue
1093 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)]) 748 lst.extend([sum(used_tiers), safe_div(sum(used_tiers), cvrg)])
1094 if chimera_correction: 749 if chimera_correction:
1095 chimeras_all = chimera_dict[key1][0] 750 chimeras_all = chimera_dict[key1][0]
1096 new_alt = sum(used_tiers) - chimeras_all 751 new_alt = sum(used_tiers) - chimeras_all
1097 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers))) 752 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers)))
1112 lst.extend(used_tiers) 767 lst.extend(used_tiers)
1113 lst.extend(cum_af) 768 lst.extend(cum_af)
1114 lst = tuple(lst) 769 lst = tuple(lst)
1115 ws2.write_row(row + 1, 0, lst) 770 ws2.write_row(row + 1, 0, lst)
1116 if chimera_correction: 771 if chimera_correction:
1117 ws2.conditional_format('P{}:Q{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 1.1"', 'format': format12, 'multi_range': 'P{}:Q{} P1:Q1'.format(row + 2, row + 2)}) 772 ws2.conditional_format('P{}:Q{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 1.1"', 'format': format1, 'multi_range': 'P{}:Q{} P1:Q1'.format(row + 2, row + 2)})
1118 ws2.conditional_format('R{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$R$1="tier 2.1"', 'format': format32, 'multi_range': 'R{}:U{} R1:U1'.format(row + 2, row + 2)}) 773 ws2.conditional_format('R{}:U{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$R$1="tier 2.1"', 'format': format3, 'multi_range': 'R{}:U{} R1:U1'.format(row + 2, row + 2)})
1119 ws2.conditional_format('V{}:AE{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format22, 'multi_range': 'V{}:AE{} V1:AE1'.format(row + 2, row + 2)}) 774 ws2.conditional_format('V{}:Z{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$V$1="tier 3.1"', 'format': format2, 'multi_range': 'V{}:Z{} V1:Z1'.format(row + 2, row + 2)})
1120 else: 775 else:
1121 ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format12, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)}) 776 ws2.conditional_format('J{}:K{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$J$1="tier 1.1"', 'format': format1, 'multi_range': 'J{}:K{} J1:K1'.format(row + 2, row + 2)})
1122 ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format32, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)}) 777 ws2.conditional_format('L{}:O{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$L$1="tier 2.1"', 'format': format3, 'multi_range': 'L{}:O{} L1:O1'.format(row + 2, row + 2)})
1123 ws2.conditional_format('P{}:Y{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format22, 'multi_range': 'P{}:Y{} P1:Y1'.format(row + 2, row + 2)}) 778 ws2.conditional_format('P{}:T{}'.format(row + 2, row + 2), {'type': 'formula', 'criteria': '=$P$1="tier 3.1"', 'format': format2, 'multi_range': 'P{}:T{} P1:T1'.format(row + 2, row + 2)})
1124 row += 1 779 row += 1
1125 780
1126 # sheet 3 781 # sheet 3
1127 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21), 782 sheet3 = [("tier 1.1", counter_tier11), ("tier 1.2", counter_tier12), ("tier 2.1", counter_tier21),
1128 ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24), 783 ("tier 2.2", counter_tier22), ("tier 2.3", counter_tier23), ("tier 2.4", counter_tier24),
1129 ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41), 784 ("tier 3.1", counter_tier31), ("tier 3.2", counter_tier32), ("tier 4.1", counter_tier41),
1130 ("tier 4.2", counter_tier42), ("tier 5.1", counter_tier51), ("tier 5.2", counter_tier52), 785 ("tier 4.2", counter_tier42), ("tier 5", counter_tier5)]
1131 ("tier 5.3", counter_tier53), ("tier 5.4", counter_tier54), ("tier 5.5", counter_tier55), ("tier 6", counter_tier6)]
1132 786
1133 header = ("tier", "count") 787 header = ("tier", "count")
1134 ws3.write_row(0, 0, header) 788 ws3.write_row(0, 0, header)
1135 789
1136 for i in range(len(sheet3)): 790 for i in range(len(sheet3)):
1146 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2), 800 ws3.conditional_format('A{}:B{}'.format(i + 2, i + 2),
1147 {'type': 'formula', 801 {'type': 'formula',
1148 'criteria': '=$A${}>="3"'.format(i + 2), 802 'criteria': '=$A${}>="3"'.format(i + 2),
1149 'format': format2}) 803 'format': format2})
1150 804
1151 description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), 805 description_tiers = [("Tier 1.1", "both ab and ba SSCS present (>75% of the sites with alternative base) and minimal FS>=3 for both SSCS in at least one mate"), ("", ""), ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), ("Tier 5", "remaining variants")]
1152 ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"),
1153 ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"),
1154 ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"),
1155 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"),
1156 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"),
1157 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"),
1158 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"),
1159 ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"),
1160 ("Tier 5.1", "variant is close to softclipping in both mates"),
1161 ("Tier 5.2", "variant is close to softclipping in one of the mates"),
1162 ("Tier 5.3", "variant is close to softclipping in one of the SSCS of both mates"),
1163 ("Tier 5.4", "variant is close to softclipping in one mate (no information of second mate"),
1164 ("Tier 5.5", "variant is close to softclipping in one of the SSCS (no information of the second mate"),
1165 ("Tier 6", "remaining variants")]
1166 examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289", 806 examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
1167 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", 807 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
1168 "4081", "4098", "5", "10", "", ""), 808 "4081", "4098", "5", "10", "", ""),
1169 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None, 809 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None,
1170 "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, 810 "289", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None,
1226 "110", "54", "110", "54", "0", "0", "110", "54", "0", "0", "1", "1", "0", "0", "0", 866 "110", "54", "110", "54", "0", "0", "110", "54", "0", "0", "1", "1", "0", "0", "0",
1227 "0", "0", "0", "1", "1", "5348", "5350", "", ""), 867 "0", "0", "0", "1", "1", "5348", "5350", "", ""),
1228 ("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263", 868 ("", "", "TTTTTAAGAATAACCCACAC", "ab2.ba1", "100", "112", "140", "145", "263",
1229 "7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0", 869 "7", "12", "7", "12", "7", "12", "0", "0", "1", "1", "0",
1230 "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")], 870 "0", "0", "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")],
1231 [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], [("" * 34), ("" * 34)], 871 [("Chr5:5-20000-13983-G-C", "5", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
1232 [("Chr5:5-20000-13983-G-C", "6", "ATGTTGTGAATAACCCACAC", "ab1.ba2", None, "186", None, "276", "269",
1233 "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0", 872 "0", "6", "0", "6", "0", "0", "0", "6", "0", "0", "0", "1", "0", "0", "0", "0", "0",
1234 "0", "1", "1", "5348", "5350", "", ""), 873 "0", "1", "1", "5348", "5350", "", ""),
1235 ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None, 874 ("", "", "ATGTTGTGAATAACCCACAC", "ab2.ba1", None, None, None, None,
1236 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0", 875 "269", "0", "0", "0", "0", "0", "0", "0", "0", None, None, None, None, "0",
1237 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]] 876 "0", "0", "0", "0", "0", "1", "1", "5348", "5350", "", "")]]
1238 877
1239 start_row = 20 878 start_row = 15
1240 ws3.write(start_row, 0, "Description of tiers with examples") 879 ws3.write(start_row, 0, "Description of tiers with examples")
1241 ws3.write_row(start_row + 1, 0, header_line) 880 ws3.write_row(start_row + 1, 0, header_line)
1242 row = 0 881 row = 0
1243 for i in range(len(description_tiers)): 882 for i in range(len(description_tiers)):
1244 ws3.write_row(start_row + 2 + row + i + 1, 0, description_tiers[i]) 883 ws3.write_row(start_row + 2 + row + i + 1, 0, description_tiers[i])
1245 ex = examples_tiers[i] 884 ex = examples_tiers[i]
1246 for k in range(len(ex)): 885 for k in range(len(ex)):
1247 ws3.write_row(start_row + 2 + row + i + k + 2, 0, ex[k]) 886 ws3.write_row(start_row + 2 + row + i + k + 2, 0, ex[k])
1248 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format13, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)}) 887 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 'format': format1, 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
1249 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), 888 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
1250 {'type': 'formula', 'criteria': '=OR($B${}="2.1",$B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2), 889 {'type': 'formula', 'criteria': '=OR($B${}="2.1",$B${}="2.2", $B${}="2.3", $B${}="2.4")'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 2),
1251 'format': format33, 890 'format': format3,
1252 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)}) 891 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
1253 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3), 892 ws3.conditional_format('L{}:M{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3),
1254 {'type': 'formula', 893 {'type': 'formula',
1255 'criteria': '=$B${}>="3"'.format(start_row + 2 + row + i + k + 2), 894 'criteria': '=$B${}>="3"'.format(start_row + 2 + row + i + k + 2),
1256 'format': format23, 895 'format': format2,
1257 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3)}) 896 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2, start_row + 2 + row + i + k + 3, start_row + 2 + row + i + k + 2)})
1258 row += 3 897 row += 3
1259 workbook.close() 898 workbook.close()
1260 workbook2.close() 899 workbook2.close()
1261 workbook3.close() 900 workbook3.close()
1262 901
1263 902
1264 if __name__ == '__main__': 903 if __name__ == '__main__':
1265 sys.exit(read2mut(sys.argv)) 904 sys.exit(read2mut(sys.argv))
1266