Mercurial > repos > mheinzl > variant_analyzer2
comparison read2mut.py @ 63:f0fc93b7945c draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author | mheinzl |
---|---|
date | Thu, 18 Mar 2021 10:07:50 +0000 |
parents | 66c1245436b9 |
children | fd342f5a97d9 |
comparison
equal
deleted
inserted
replaced
62:66c1245436b9 | 63:f0fc93b7945c |
---|---|
8 Looks for reads with mutation at known | 8 Looks for reads with mutation at known |
9 positions and calculates frequencies and stats. | 9 positions and calculates frequencies and stats. |
10 | 10 |
11 ======= ========== ================= ================================ | 11 ======= ========== ================= ================================ |
12 Version Date Author Description | 12 Version Date Author Description |
13 0.2.1 2019-10-27 Gundula Povysil - | 13 0.2.2 2019-10-27 Gundula Povysil - |
14 ======= ========== ================= ================================ | 14 ======= ========== ================= ================================ |
15 | 15 |
16 | 16 |
17 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam | 17 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam |
18 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json | 18 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json |
267 if len(value) < thresh: | 267 if len(value) < thresh: |
268 pure_tags_dict_short[key] = value | 268 pure_tags_dict_short[key] = value |
269 else: | 269 else: |
270 pure_tags_dict_short = pure_tags_dict | 270 pure_tags_dict_short = pure_tags_dict |
271 | 271 |
272 # whole_array = [] | |
273 # for k in pure_tags_dict.values(): | |
274 # if len(k) != 0: | |
275 # keys = k.keys() | |
276 # if len(keys) > 1: | |
277 # for k1 in keys: | |
278 # whole_array.append(k1) | |
279 # else: | |
280 # whole_array.append(keys[0]) | |
281 | |
282 csv_data = open(outputFile_csv, "wb") | 272 csv_data = open(outputFile_csv, "wb") |
283 csv_writer = csv.writer(csv_data, delimiter=",") | 273 csv_writer = csv.writer(csv_data, delimiter=",") |
284 | 274 |
285 # output summary with threshold | 275 # output summary with threshold |
286 workbook = xlsxwriter.Workbook(outfile) | 276 workbook = xlsxwriter.Workbook(outfile) |
319 counter_tier24 = 0 | 309 counter_tier24 = 0 |
320 counter_tier31 = 0 | 310 counter_tier31 = 0 |
321 counter_tier32 = 0 | 311 counter_tier32 = 0 |
322 counter_tier25 = 0 | 312 counter_tier25 = 0 |
323 counter_tier4 = 0 | 313 counter_tier4 = 0 |
324 # if chimera_correction: | |
325 # counter_tier43 = 0 | |
326 counter_tier51 = 0 | 314 counter_tier51 = 0 |
327 counter_tier52 = 0 | 315 counter_tier52 = 0 |
328 counter_tier53 = 0 | 316 counter_tier53 = 0 |
329 counter_tier54 = 0 | 317 counter_tier54 = 0 |
330 counter_tier55 = 0 | 318 counter_tier55 = 0 |
332 counter_tier7 = 0 | 320 counter_tier7 = 0 |
333 | 321 |
334 row = 1 | 322 row = 1 |
335 tier_dict = {} | 323 tier_dict = {} |
336 chimera_dict = {} | 324 chimera_dict = {} |
337 #change_tier_after_print = {} | |
338 for key1, value1 in sorted(mut_dict.items()): | 325 for key1, value1 in sorted(mut_dict.items()): |
339 counts_mut = 0 | 326 counts_mut = 0 |
340 chimeric_tag_list = [] | 327 chimeric_tag_list = [] |
341 chimeric_tag = {} | 328 chimeric_tag = {} |
342 if key1 in pure_tags_dict_short.keys(): | 329 if key1 in pure_tags_dict_short.keys(): |
343 change_tier_after_print = [] | 330 change_tier_after_print = [] |
344 | |
345 i = np.where(np.array(['#'.join(str(i) for i in z) | 331 i = np.where(np.array(['#'.join(str(i) for i in z) |
346 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] | 332 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] |
347 ref = mut_array[i, 2] | 333 ref = mut_array[i, 2] |
348 alt = mut_array[i, 3] | 334 alt = mut_array[i, 3] |
349 dcs_median = cvrg_dict[key1][2] | 335 dcs_median = cvrg_dict[key1][2] |
638 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] | 624 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] |
639 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] | 625 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] |
640 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] | 626 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] |
641 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 627 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
642 if any(ij is True for ij in softclipped_both_ends_idx1): | 628 if any(ij is True for ij in softclipped_both_ends_idx1): |
643 print(softclipped_both_ends_idx1) | |
644 for nr, indx in enumerate(softclipped_both_ends_idx1): | 629 for nr, indx in enumerate(softclipped_both_ends_idx1): |
645 if indx: | 630 if indx: |
646 if dist_start_read1[nr] <= dist_end_read1[nr]: | 631 if dist_start_read1[nr] <= dist_end_read1[nr]: |
647 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number | 632 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number |
648 else: | 633 else: |
659 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] | 644 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] |
660 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] | 645 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] |
661 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] | 646 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] |
662 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 647 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
663 if any(ij is True for ij in softclipped_both_ends_idx4): | 648 if any(ij is True for ij in softclipped_both_ends_idx4): |
664 print(softclipped_both_ends_idx4) | |
665 for nr, indx in enumerate(softclipped_both_ends_idx4): | 649 for nr, indx in enumerate(softclipped_both_ends_idx4): |
666 if indx: | 650 if indx: |
667 if dist_start_read4[nr] <= dist_end_read4[nr]: | 651 if dist_start_read4[nr] <= dist_end_read4[nr]: |
668 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number | 652 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number |
669 else: | 653 else: |
680 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] | 664 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] |
681 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] | 665 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] |
682 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] | 666 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] |
683 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 667 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
684 if any(ij is True for ij in softclipped_both_ends_idx2): | 668 if any(ij is True for ij in softclipped_both_ends_idx2): |
685 print(softclipped_both_ends_idx2) | |
686 for nr, indx in enumerate(softclipped_both_ends_idx2): | 669 for nr, indx in enumerate(softclipped_both_ends_idx2): |
687 if indx: | 670 if indx: |
688 if dist_start_read2[nr] <= dist_end_read2[nr]: | 671 if dist_start_read2[nr] <= dist_end_read2[nr]: |
689 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number | 672 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number |
690 else: | 673 else: |
701 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] | 684 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] |
702 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] | 685 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] |
703 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] | 686 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] |
704 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 687 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
705 if any(ij is True for ij in softclipped_both_ends_idx3): | 688 if any(ij is True for ij in softclipped_both_ends_idx3): |
706 print(softclipped_both_ends_idx3) | |
707 for nr, indx in enumerate(softclipped_both_ends_idx3): | 689 for nr, indx in enumerate(softclipped_both_ends_idx3): |
708 if indx: | 690 if indx: |
709 if dist_start_read3[nr] <= dist_end_read3[nr]: | 691 if dist_start_read3[nr] <= dist_end_read3[nr]: |
710 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh | 692 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh |
711 else: | 693 else: |
1089 min_tag_array2 = array2[min_index] # get whole tag with min HD | 1071 min_tag_array2 = array2[min_index] # get whole tag with min HD |
1090 min_value = dist.min() | 1072 min_value = dist.min() |
1091 # calculate HD of "b" to all "b's" or "a" to all "a's" | 1073 # calculate HD of "b" to all "b's" or "a" to all "a's" |
1092 dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e)) | 1074 dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e)) |
1093 for e in min_tag_half2]) | 1075 for e in min_tag_half2]) |
1094 | |
1095 dist2 = dist_second_half.max() | 1076 dist2 = dist_second_half.max() |
1096 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD | 1077 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD |
1097 max_tag = min_tag_array2[max_index] | 1078 max_tag = min_tag_array2[max_index] |
1098 | 1079 |
1099 # tags which have identical parts: | 1080 # tags which have identical parts: |
1168 else: | 1149 else: |
1169 chimeric_dcs_high_tiers += high_tiers | 1150 chimeric_dcs_high_tiers += high_tiers |
1170 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers) | 1151 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers) |
1171 | 1152 |
1172 # write to file | 1153 # write to file |
1173 | |
1174 # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4 | 1154 # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4 |
1175 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]]) | 1155 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]]) |
1176 | |
1177 correct_tier = False | 1156 correct_tier = False |
1178 | |
1179 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: | 1157 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: |
1180 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] | 1158 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] |
1181 tier_dict[key1]["tier 4"] = 0 | 1159 tier_dict[key1]["tier 4"] = 0 |
1182 correct_tier = True | 1160 correct_tier = True |
1183 | 1161 |
1315 ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), | 1293 ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), |
1316 ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), | 1294 ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), |
1317 ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), | 1295 ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), |
1318 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), | 1296 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), |
1319 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), | 1297 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), |
1320 ("Tier 2.5", "variants at the start or end of the read and recurring mutation on this position in tier 1.1-2.4"), | 1298 ("Tier 2.5", "variants at the start or end of the read (ignoring variant position tier 1.1-2.4) and recurring mutation on this position in tier 1.1-2.4"), |
1321 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), | 1299 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), |
1322 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), | 1300 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), |
1323 ("Tier 4", "variants at the start or end of the reads"), | 1301 ("Tier 4", "variants at the start or end of the reads"), |
1324 ("Tier 5.1", "variant is close to softclipping in both mates and SSCS"), | 1302 ("Tier 5.1", "variant is close to softclipping in both mates and SSCS"), |
1325 ("Tier 5.2", "variant is close to softclipping in one of the mates but both SSCS"), | 1303 ("Tier 5.2", "variant is close to softclipping in one of the mates but both SSCS"), |