Mercurial > repos > mheinzl > variant_analyzer2
comparison read2mut.py @ 63:f0fc93b7945c draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
| author | mheinzl |
|---|---|
| date | Thu, 18 Mar 2021 10:07:50 +0000 |
| parents | 66c1245436b9 |
| children | fd342f5a97d9 |
comparison
equal
deleted
inserted
replaced
| 62:66c1245436b9 | 63:f0fc93b7945c |
|---|---|
| 8 Looks for reads with mutation at known | 8 Looks for reads with mutation at known |
| 9 positions and calculates frequencies and stats. | 9 positions and calculates frequencies and stats. |
| 10 | 10 |
| 11 ======= ========== ================= ================================ | 11 ======= ========== ================= ================================ |
| 12 Version Date Author Description | 12 Version Date Author Description |
| 13 0.2.1 2019-10-27 Gundula Povysil - | 13 0.2.2 2019-10-27 Gundula Povysil - |
| 14 ======= ========== ================= ================================ | 14 ======= ========== ================= ================================ |
| 15 | 15 |
| 16 | 16 |
| 17 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam | 17 USAGE: python read2mut.py --mutFile DCS_Mutations.tabular --bamFile Interesting_Reads.trim.bam |
| 18 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json | 18 --inputJson tag_count_dict.json --sscsJson SSCS_counts.json |
| 267 if len(value) < thresh: | 267 if len(value) < thresh: |
| 268 pure_tags_dict_short[key] = value | 268 pure_tags_dict_short[key] = value |
| 269 else: | 269 else: |
| 270 pure_tags_dict_short = pure_tags_dict | 270 pure_tags_dict_short = pure_tags_dict |
| 271 | 271 |
| 272 # whole_array = [] | |
| 273 # for k in pure_tags_dict.values(): | |
| 274 # if len(k) != 0: | |
| 275 # keys = k.keys() | |
| 276 # if len(keys) > 1: | |
| 277 # for k1 in keys: | |
| 278 # whole_array.append(k1) | |
| 279 # else: | |
| 280 # whole_array.append(keys[0]) | |
| 281 | |
| 282 csv_data = open(outputFile_csv, "wb") | 272 csv_data = open(outputFile_csv, "wb") |
| 283 csv_writer = csv.writer(csv_data, delimiter=",") | 273 csv_writer = csv.writer(csv_data, delimiter=",") |
| 284 | 274 |
| 285 # output summary with threshold | 275 # output summary with threshold |
| 286 workbook = xlsxwriter.Workbook(outfile) | 276 workbook = xlsxwriter.Workbook(outfile) |
| 319 counter_tier24 = 0 | 309 counter_tier24 = 0 |
| 320 counter_tier31 = 0 | 310 counter_tier31 = 0 |
| 321 counter_tier32 = 0 | 311 counter_tier32 = 0 |
| 322 counter_tier25 = 0 | 312 counter_tier25 = 0 |
| 323 counter_tier4 = 0 | 313 counter_tier4 = 0 |
| 324 # if chimera_correction: | |
| 325 # counter_tier43 = 0 | |
| 326 counter_tier51 = 0 | 314 counter_tier51 = 0 |
| 327 counter_tier52 = 0 | 315 counter_tier52 = 0 |
| 328 counter_tier53 = 0 | 316 counter_tier53 = 0 |
| 329 counter_tier54 = 0 | 317 counter_tier54 = 0 |
| 330 counter_tier55 = 0 | 318 counter_tier55 = 0 |
| 332 counter_tier7 = 0 | 320 counter_tier7 = 0 |
| 333 | 321 |
| 334 row = 1 | 322 row = 1 |
| 335 tier_dict = {} | 323 tier_dict = {} |
| 336 chimera_dict = {} | 324 chimera_dict = {} |
| 337 #change_tier_after_print = {} | |
| 338 for key1, value1 in sorted(mut_dict.items()): | 325 for key1, value1 in sorted(mut_dict.items()): |
| 339 counts_mut = 0 | 326 counts_mut = 0 |
| 340 chimeric_tag_list = [] | 327 chimeric_tag_list = [] |
| 341 chimeric_tag = {} | 328 chimeric_tag = {} |
| 342 if key1 in pure_tags_dict_short.keys(): | 329 if key1 in pure_tags_dict_short.keys(): |
| 343 change_tier_after_print = [] | 330 change_tier_after_print = [] |
| 344 | |
| 345 i = np.where(np.array(['#'.join(str(i) for i in z) | 331 i = np.where(np.array(['#'.join(str(i) for i in z) |
| 346 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] | 332 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] |
| 347 ref = mut_array[i, 2] | 333 ref = mut_array[i, 2] |
| 348 alt = mut_array[i, 3] | 334 alt = mut_array[i, 3] |
| 349 dcs_median = cvrg_dict[key1][2] | 335 dcs_median = cvrg_dict[key1][2] |
| 638 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] | 624 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] |
| 639 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] | 625 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] |
| 640 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] | 626 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] |
| 641 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 627 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
| 642 if any(ij is True for ij in softclipped_both_ends_idx1): | 628 if any(ij is True for ij in softclipped_both_ends_idx1): |
| 643 print(softclipped_both_ends_idx1) | |
| 644 for nr, indx in enumerate(softclipped_both_ends_idx1): | 629 for nr, indx in enumerate(softclipped_both_ends_idx1): |
| 645 if indx: | 630 if indx: |
| 646 if dist_start_read1[nr] <= dist_end_read1[nr]: | 631 if dist_start_read1[nr] <= dist_end_read1[nr]: |
| 647 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number | 632 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number |
| 648 else: | 633 else: |
| 659 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] | 644 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] |
| 660 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] | 645 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] |
| 661 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] | 646 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] |
| 662 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 647 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
| 663 if any(ij is True for ij in softclipped_both_ends_idx4): | 648 if any(ij is True for ij in softclipped_both_ends_idx4): |
| 664 print(softclipped_both_ends_idx4) | |
| 665 for nr, indx in enumerate(softclipped_both_ends_idx4): | 649 for nr, indx in enumerate(softclipped_both_ends_idx4): |
| 666 if indx: | 650 if indx: |
| 667 if dist_start_read4[nr] <= dist_end_read4[nr]: | 651 if dist_start_read4[nr] <= dist_end_read4[nr]: |
| 668 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number | 652 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number |
| 669 else: | 653 else: |
| 680 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] | 664 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] |
| 681 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] | 665 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] |
| 682 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] | 666 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] |
| 683 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 667 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
| 684 if any(ij is True for ij in softclipped_both_ends_idx2): | 668 if any(ij is True for ij in softclipped_both_ends_idx2): |
| 685 print(softclipped_both_ends_idx2) | |
| 686 for nr, indx in enumerate(softclipped_both_ends_idx2): | 669 for nr, indx in enumerate(softclipped_both_ends_idx2): |
| 687 if indx: | 670 if indx: |
| 688 if dist_start_read2[nr] <= dist_end_read2[nr]: | 671 if dist_start_read2[nr] <= dist_end_read2[nr]: |
| 689 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number | 672 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number |
| 690 else: | 673 else: |
| 701 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] | 684 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] |
| 702 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] | 685 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] |
| 703 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] | 686 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] |
| 704 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 687 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
| 705 if any(ij is True for ij in softclipped_both_ends_idx3): | 688 if any(ij is True for ij in softclipped_both_ends_idx3): |
| 706 print(softclipped_both_ends_idx3) | |
| 707 for nr, indx in enumerate(softclipped_both_ends_idx3): | 689 for nr, indx in enumerate(softclipped_both_ends_idx3): |
| 708 if indx: | 690 if indx: |
| 709 if dist_start_read3[nr] <= dist_end_read3[nr]: | 691 if dist_start_read3[nr] <= dist_end_read3[nr]: |
| 710 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh | 692 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh |
| 711 else: | 693 else: |
| 1089 min_tag_array2 = array2[min_index] # get whole tag with min HD | 1071 min_tag_array2 = array2[min_index] # get whole tag with min HD |
| 1090 min_value = dist.min() | 1072 min_value = dist.min() |
| 1091 # calculate HD of "b" to all "b's" or "a" to all "a's" | 1073 # calculate HD of "b" to all "b's" or "a" to all "a's" |
| 1092 dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e)) | 1074 dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e)) |
| 1093 for e in min_tag_half2]) | 1075 for e in min_tag_half2]) |
| 1094 | |
| 1095 dist2 = dist_second_half.max() | 1076 dist2 = dist_second_half.max() |
| 1096 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD | 1077 max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD |
| 1097 max_tag = min_tag_array2[max_index] | 1078 max_tag = min_tag_array2[max_index] |
| 1098 | 1079 |
| 1099 # tags which have identical parts: | 1080 # tags which have identical parts: |
| 1168 else: | 1149 else: |
| 1169 chimeric_dcs_high_tiers += high_tiers | 1150 chimeric_dcs_high_tiers += high_tiers |
| 1170 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers) | 1151 chimera_dict[key1] = (chimeric_dcs, chimeric_dcs_high_tiers) |
| 1171 | 1152 |
| 1172 # write to file | 1153 # write to file |
| 1173 | |
| 1174 # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4 | 1154 # move tier 4 counts to tier 2.5 if there other mutations with tier <= 2.4 |
| 1175 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]]) | 1155 sum_highTiers = sum([tier_dict[key1][ij] for ij in list(sorted(tier_dict[key1].keys()))[:6]]) |
| 1176 | |
| 1177 correct_tier = False | 1156 correct_tier = False |
| 1178 | |
| 1179 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: | 1157 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: |
| 1180 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] | 1158 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] |
| 1181 tier_dict[key1]["tier 4"] = 0 | 1159 tier_dict[key1]["tier 4"] = 0 |
| 1182 correct_tier = True | 1160 correct_tier = True |
| 1183 | 1161 |
| 1315 ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), | 1293 ("Tier 1.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1) and minimal FS>=3 for at least one of the SSCS"), |
| 1316 ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), | 1294 ("Tier 2.1", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS>=3 for at least one of the SSCS in at least one mate"), |
| 1317 ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), | 1295 ("Tier 2.2", "both ab and ba SSCS present (>75% of the sites with alt. base) and mate pair validation (min. FS=1)"), |
| 1318 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), | 1296 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), |
| 1319 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), | 1297 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), |
| 1320 ("Tier 2.5", "variants at the start or end of the read and recurring mutation on this position in tier 1.1-2.4"), | 1298 ("Tier 2.5", "variants at the start or end of the read (ignoring variant position tier 1.1-2.4) and recurring mutation on this position in tier 1.1-2.4"), |
| 1321 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), | 1299 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), |
| 1322 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), | 1300 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), |
| 1323 ("Tier 4", "variants at the start or end of the reads"), | 1301 ("Tier 4", "variants at the start or end of the reads"), |
| 1324 ("Tier 5.1", "variant is close to softclipping in both mates and SSCS"), | 1302 ("Tier 5.1", "variant is close to softclipping in both mates and SSCS"), |
| 1325 ("Tier 5.2", "variant is close to softclipping in one of the mates but both SSCS"), | 1303 ("Tier 5.2", "variant is close to softclipping in one of the mates but both SSCS"), |
