Mercurial > repos > mheinzl > variant_analyzer2
comparison read2mut.py @ 61:3722268ffac5 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
| author | mheinzl |
|---|---|
| date | Wed, 17 Mar 2021 13:14:40 +0000 |
| parents | 9ce53bf0931c |
| children | 66c1245436b9 |
comparison
equal
deleted
inserted
replaced
| 60:9ce53bf0931c | 61:3722268ffac5 |
|---|---|
| 124 mut_dict = {} | 124 mut_dict = {} |
| 125 mut_read_pos_dict = {} | 125 mut_read_pos_dict = {} |
| 126 mut_read_dict = {} | 126 mut_read_dict = {} |
| 127 reads_dict = {} | 127 reads_dict = {} |
| 128 mut_read_cigar_dict = {} | 128 mut_read_cigar_dict = {} |
| 129 real_start_end = {} | |
| 129 i = 0 | 130 i = 0 |
| 130 mut_array = [] | 131 mut_array = [] |
| 131 | 132 |
| 132 for count, variant in enumerate(VCF(file1)): | 133 for count, variant in enumerate(VCF(file1)): |
| 133 #if count == 2000: | 134 #if count == 2000: |
| 147 i += 1 | 148 i += 1 |
| 148 mut_dict[chrom_stop_pos] = {} | 149 mut_dict[chrom_stop_pos] = {} |
| 149 mut_read_pos_dict[chrom_stop_pos] = {} | 150 mut_read_pos_dict[chrom_stop_pos] = {} |
| 150 reads_dict[chrom_stop_pos] = {} | 151 reads_dict[chrom_stop_pos] = {} |
| 151 mut_read_cigar_dict[chrom_stop_pos] = {} | 152 mut_read_cigar_dict[chrom_stop_pos] = {} |
| 153 real_start_end[chrom_stop_pos] = {} | |
| 152 | 154 |
| 153 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): | 155 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): |
| 154 if pileupcolumn.reference_pos == stop_pos: | 156 if pileupcolumn.reference_pos == stop_pos: |
| 155 count_alt = 0 | 157 count_alt = 0 |
| 156 count_ref = 0 | 158 count_ref = 0 |
| 165 n += 1 | 167 n += 1 |
| 166 if not pileupread.is_del and not pileupread.is_refskip: | 168 if not pileupread.is_del and not pileupread.is_refskip: |
| 167 tag = pileupread.alignment.query_name | 169 tag = pileupread.alignment.query_name |
| 168 nuc = pileupread.alignment.query_sequence[pileupread.query_position] | 170 nuc = pileupread.alignment.query_sequence[pileupread.query_position] |
| 169 phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 | 171 phred = ord(pileupread.alignment.qual[pileupread.query_position]) - 33 |
| 172 | |
| 173 # if read is softclipped, store real position in reference | |
| 174 if "S" in pileupread.alignment.cigarstring: | |
| 175 # spftclipped at start | |
| 176 if re.search(r"^[0-9]+S", pileupread.alignment.cigarstring): | |
| 177 start = pileupread.alignment.reference_start - int(pileupread.alignment.cigarstring.split("S")[0]) | |
| 178 end = pileupread.alignment.reference_end | |
| 179 # softclipped at end | |
| 180 elif re.search(r"S$", pileupread.alignment.cigarstring): | |
| 181 end = pileupread.alignment.reference_end + int(re.split("[A-Z]", str(pileupread.alignment.cigarstring))[-2]) | |
| 182 start = pileupread.alignment.reference_start | |
| 183 else: | |
| 184 end = pileupread.alignment.reference_end | |
| 185 start = pileupread.alignment.reference_start | |
| 186 | |
| 170 if phred < phred_score: | 187 if phred < phred_score: |
| 171 nuc = "lowQ" | 188 nuc = "lowQ" |
| 172 if tag not in mut_dict[chrom_stop_pos]: | 189 if tag not in mut_dict[chrom_stop_pos]: |
| 173 mut_dict[chrom_stop_pos][tag] = {} | 190 mut_dict[chrom_stop_pos][tag] = {} |
| 174 if nuc in mut_dict[chrom_stop_pos][tag]: | 191 if nuc in mut_dict[chrom_stop_pos][tag]: |
| 177 mut_dict[chrom_stop_pos][tag][nuc] = 1 | 194 mut_dict[chrom_stop_pos][tag][nuc] = 1 |
| 178 if tag not in mut_read_pos_dict[chrom_stop_pos]: | 195 if tag not in mut_read_pos_dict[chrom_stop_pos]: |
| 179 mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] | 196 mut_read_pos_dict[chrom_stop_pos][tag] = [pileupread.query_position + 1] |
| 180 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] | 197 reads_dict[chrom_stop_pos][tag] = [len(pileupread.alignment.query_sequence)] |
| 181 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] | 198 mut_read_cigar_dict[chrom_stop_pos][tag] = [pileupread.alignment.cigarstring] |
| 182 | 199 real_start_end[chrom_stop_pos][tag] = [(start, end)] |
| 183 #alignedRefPositions = pileupread.get_reference_positions()[0] | |
| 184 else: | 200 else: |
| 185 mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) | 201 mut_read_pos_dict[chrom_stop_pos][tag].append(pileupread.query_position + 1) |
| 186 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) | 202 reads_dict[chrom_stop_pos][tag].append(len(pileupread.alignment.query_sequence)) |
| 187 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) | 203 mut_read_cigar_dict[chrom_stop_pos][tag].append(pileupread.alignment.cigarstring) |
| 204 real_start_end[chrom_stop_pos][tag].append((start, end)) | |
| 188 if nuc == alt: | 205 if nuc == alt: |
| 189 count_alt += 1 | 206 count_alt += 1 |
| 190 if tag not in mut_read_dict: | 207 if tag not in mut_read_dict: |
| 191 mut_read_dict[tag] = {} | 208 mut_read_dict[tag] = {} |
| 192 mut_read_dict[tag][chrom_stop_pos] = (alt, ref) | 209 mut_read_dict[tag][chrom_stop_pos] = (alt, ref) |
| 531 end_read1 = end_read2 = end_read3 = end_read4 = [] | 548 end_read1 = end_read2 = end_read3 = end_read4 = [] |
| 532 if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys(): | 549 if key2[:-5] + '.ab.1' in mut_read_pos_dict[key1].keys(): |
| 533 read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1'])) | 550 read_pos1 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.1'])) |
| 534 read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1'])) | 551 read_len_median1 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.1'])) |
| 535 cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'] | 552 cigars_dcs1 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.1'] |
| 536 #print(mut_read_cigar_dict[key1][key2[:-5] + '.ab.1']) | |
| 537 pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1'] | 553 pos_read1 = mut_read_pos_dict[key1][key2[:-5] + '.ab.1'] |
| 538 #print(cigars_dcs1) | |
| 539 end_read1 = reads_dict[key1][key2[:-5] + '.ab.1'] | 554 end_read1 = reads_dict[key1][key2[:-5] + '.ab.1'] |
| 555 ref_positions1 = real_start_end[key1][key2[:-5] + '.ab.1'] | |
| 540 if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys(): | 556 if key2[:-5] + '.ab.2' in mut_read_pos_dict[key1].keys(): |
| 541 read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2'])) | 557 read_pos2 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ab.2'])) |
| 542 read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2'])) | 558 read_len_median2 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ab.2'])) |
| 543 cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2'] | 559 cigars_dcs2 = mut_read_cigar_dict[key1][key2[:-5] + '.ab.2'] |
| 544 pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2'] | 560 pos_read2 = mut_read_pos_dict[key1][key2[:-5] + '.ab.2'] |
| 545 end_read2 = reads_dict[key1][key2[:-5] + '.ab.2'] | 561 end_read2 = reads_dict[key1][key2[:-5] + '.ab.2'] |
| 562 ref_positions2 = real_start_end[key1][key2[:-5] + '.ab.2'] | |
| 546 if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys(): | 563 if key2[:-5] + '.ba.1' in mut_read_pos_dict[key1].keys(): |
| 547 read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1'])) | 564 read_pos3 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.1'])) |
| 548 read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1'])) | 565 read_len_median3 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.1'])) |
| 549 cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1'] | 566 cigars_dcs3 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.1'] |
| 550 pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1'] | 567 pos_read3 = mut_read_pos_dict[key1][key2[:-5] + '.ba.1'] |
| 551 end_read3 = reads_dict[key1][key2[:-5] + '.ba.1'] | 568 end_read3 = reads_dict[key1][key2[:-5] + '.ba.1'] |
| 569 ref_positions3 = real_start_end[key1][key2[:-5] + '.ba.1'] | |
| 552 if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys(): | 570 if key2[:-5] + '.ba.2' in mut_read_pos_dict[key1].keys(): |
| 553 read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2'])) | 571 read_pos4 = np.median(np.array(mut_read_pos_dict[key1][key2[:-5] + '.ba.2'])) |
| 554 read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2'])) | 572 read_len_median4 = np.median(np.array(reads_dict[key1][key2[:-5] + '.ba.2'])) |
| 555 #print(mut_read_cigar_dict[key1][key2[:-5] + '.ba.2']) | |
| 556 cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'] | 573 cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2'] |
| 557 | |
| 558 pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2'] | 574 pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2'] |
| 559 #print(cigars_dcs4) | |
| 560 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2'] | 575 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2'] |
| 576 ref_positions4 = real_start_end[key1][key2[:-5] + '.ba.2'] | |
| 561 | 577 |
| 562 used_keys.append(key2[:-5]) | 578 used_keys.append(key2[:-5]) |
| 563 counts_mut += 1 | 579 counts_mut += 1 |
| 564 if (alt1f + alt2f + alt3f + alt4f) > 0.5: | 580 if (alt1f + alt2f + alt3f + alt4f) > 0.5: |
| 581 total1new_trim, total2new_trim, total3new_trim, total4new_trim = total1new, total2new, total3new, total4new | |
| 565 if total1new == 0: | 582 if total1new == 0: |
| 566 ref1f = alt1f = None | 583 ref1f = alt1f = None |
| 567 alt1ff = -1 | 584 alt1ff = -1 |
| 585 alt1ff_trim = -1 | |
| 568 else: | 586 else: |
| 569 alt1ff = alt1f | 587 alt1ff = alt1f |
| 588 alt1ff_trim = alt1f | |
| 570 if total2new == 0: | 589 if total2new == 0: |
| 571 ref2f = alt2f = None | 590 ref2f = alt2f = None |
| 572 alt2ff = -1 | 591 alt2ff = -1 |
| 592 alt2ff_trim = -1 | |
| 573 else: | 593 else: |
| 574 alt2ff = alt2f | 594 alt2ff = alt2f |
| 595 alt2ff_trim = alt2f | |
| 575 if total3new == 0: | 596 if total3new == 0: |
| 576 ref3f = alt3f = None | 597 ref3f = alt3f = None |
| 577 alt3ff = -1 | 598 alt3ff = -1 |
| 599 alt3ff_trim = -1 | |
| 578 else: | 600 else: |
| 579 alt3ff = alt3f | 601 alt3ff = alt3f |
| 602 alt3ff_trim = alt3f | |
| 580 if total4new == 0: | 603 if total4new == 0: |
| 581 ref4f = alt4f = None | 604 ref4f = alt4f = None |
| 582 alt4ff = -1 | 605 alt4ff = -1 |
| 606 alt4ff_trim = alt4f | |
| 583 else: | 607 else: |
| 584 alt4ff = alt4f | 608 alt4ff = alt4f |
| 609 alt4ff_trim = alt4f | |
| 585 | 610 |
| 586 beg1 = beg4 = beg2 = beg3 = 0 | 611 beg1 = beg4 = beg2 = beg3 = 0 |
| 587 | 612 |
| 588 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) | 613 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) |
| 589 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) | 614 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) |
| 594 softclipped_mutation_oneOfTwoMates = False | 619 softclipped_mutation_oneOfTwoMates = False |
| 595 softclipped_mutation_oneOfTwoSSCS = False | 620 softclipped_mutation_oneOfTwoSSCS = False |
| 596 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 621 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
| 597 softclipped_mutation_oneMate = False | 622 softclipped_mutation_oneMate = False |
| 598 softclipped_mutation_oneMateOneSSCS = False | 623 softclipped_mutation_oneMateOneSSCS = False |
| 599 print() | 624 |
| 600 print(key1, cigars_dcs1, cigars_dcs4, cigars_dcs2, cigars_dcs3) | 625 trimmed_actual_high_tier = False |
| 626 | |
| 601 dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = [] | 627 dist_start_read1 = dist_start_read2 = dist_start_read3 = dist_start_read4 = [] |
| 602 dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = [] | 628 dist_end_read1 = dist_end_read2 = dist_end_read3 = dist_end_read4 = [] |
| 603 ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False | 629 ratio_dist_start1 = ratio_dist_start2 = ratio_dist_start3 = ratio_dist_start4 = False |
| 604 ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False | 630 ratio_dist_end1 = ratio_dist_end2 = ratio_dist_end3 = ratio_dist_end4 = False |
| 605 | 631 |
| 606 # mate 1 - SSCS ab | 632 # mate 1 - SSCS ab |
| 607 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] | 633 softclipped_idx1 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs1] |
| 608 ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads | 634 ratio1 = safe_div(sum(softclipped_idx1), float(len(softclipped_idx1))) >= threshold_reads |
| 609 | |
| 610 if any(ij is True for ij in softclipped_idx1): | 635 if any(ij is True for ij in softclipped_idx1): |
| 611 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] | 636 softclipped_both_ends_idx1 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs1] |
| 612 softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1] | 637 softclipped_start1 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs1] |
| 613 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] | 638 softclipped_end1 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs1] |
| 614 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] | 639 dist_start_read1 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start1, pos_read1)] |
| 615 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] | 640 dist_end_read1 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end1, pos_read1, end_read1)] |
| 616 | |
| 617 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 641 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
| 618 if any(ij is True for ij in softclipped_both_ends_idx1): | 642 if any(ij is True for ij in softclipped_both_ends_idx1): |
| 619 print(softclipped_both_ends_idx1) | 643 print(softclipped_both_ends_idx1) |
| 620 for nr, indx in enumerate(softclipped_both_ends_idx1): | 644 for nr, indx in enumerate(softclipped_both_ends_idx1): |
| 621 if indx: | 645 if indx: |
| 623 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number | 647 dist_end_read1[nr] = thr + 1000 # use dist of start and set start to very large number |
| 624 else: | 648 else: |
| 625 dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number | 649 dist_start_read1[nr] = thr + 1000 # use dist of end and set start to very large number |
| 626 ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads | 650 ratio_dist_start1 = safe_div(sum([True if x <= thr else False for x in dist_start_read1]), float(sum(softclipped_idx1))) >= threshold_reads |
| 627 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads | 651 ratio_dist_end1 = safe_div(sum([True if x <= thr else False for x in dist_end_read1]), float(sum(softclipped_idx1))) >= threshold_reads |
| 628 print(key1, "mate1 ab", dist_start_read1, dist_end_read1, cigars_dcs1, ratio1, ratio_dist_start1, ratio_dist_end1) | |
| 629 | 652 |
| 630 # mate 1 - SSCS ba | 653 # mate 1 - SSCS ba |
| 631 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] | 654 softclipped_idx4 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs4] |
| 632 ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads | 655 ratio4 = safe_div(sum(softclipped_idx4), float(len(softclipped_idx4))) >= threshold_reads |
| 633 if any(ij is True for ij in softclipped_idx4): | 656 if any(ij is True for ij in softclipped_idx4): |
| 634 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] | 657 softclipped_both_ends_idx4 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs4] |
| 635 softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4] | 658 softclipped_start4 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs4] |
| 636 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] | 659 softclipped_end4 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs4] |
| 637 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] | 660 dist_start_read4 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start4, pos_read4)] |
| 638 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] | 661 dist_end_read4 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end4, pos_read4, end_read4)] |
| 639 | |
| 640 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 662 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
| 641 if any(ij is True for ij in softclipped_both_ends_idx4): | 663 if any(ij is True for ij in softclipped_both_ends_idx4): |
| 642 print(softclipped_both_ends_idx4) | 664 print(softclipped_both_ends_idx4) |
| 643 for nr, indx in enumerate(softclipped_both_ends_idx4): | 665 for nr, indx in enumerate(softclipped_both_ends_idx4): |
| 644 if indx: | 666 if indx: |
| 646 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number | 668 dist_end_read4[nr] = thr + 1000 # use dist of start and set start to very large number |
| 647 else: | 669 else: |
| 648 dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number | 670 dist_start_read4[nr] = thr + 1000 # use dist of end and set start to very large number |
| 649 ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads | 671 ratio_dist_start4 = safe_div(sum([True if x <= thr else False for x in dist_start_read4]), float(sum(softclipped_idx4))) >= threshold_reads |
| 650 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads | 672 ratio_dist_end4 = safe_div(sum([True if x <= thr else False for x in dist_end_read4]), float(sum(softclipped_idx4))) >= threshold_reads |
| 651 print(key1, "mate1 ba", dist_start_read4, dist_end_read4,cigars_dcs4, ratio4, ratio_dist_start4, ratio_dist_end4) | |
| 652 | 673 |
| 653 # mate 2 - SSCS ab | 674 # mate 2 - SSCS ab |
| 654 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] | 675 softclipped_idx2 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs2] |
| 655 #print(sum(softclipped_idx2)) | |
| 656 ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads | 676 ratio2 = safe_div(sum(softclipped_idx2), float(len(softclipped_idx2))) >= threshold_reads |
| 657 if any(ij is True for ij in softclipped_idx2): | 677 if any(ij is True for ij in softclipped_idx2): |
| 658 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2] | 678 softclipped_both_ends_idx2 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs2] |
| 659 softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2] | 679 softclipped_start2 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs2] |
| 660 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] | 680 softclipped_end2 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs2] |
| 661 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] | 681 dist_start_read2 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start2, pos_read2)] |
| 662 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] | 682 dist_end_read2 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end2, pos_read2, end_read2)] |
| 663 | |
| 664 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 683 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
| 665 if any(ij is True for ij in softclipped_both_ends_idx2): | 684 if any(ij is True for ij in softclipped_both_ends_idx2): |
| 666 print(softclipped_both_ends_idx2) | 685 print(softclipped_both_ends_idx2) |
| 667 for nr, indx in enumerate(softclipped_both_ends_idx2): | 686 for nr, indx in enumerate(softclipped_both_ends_idx2): |
| 668 if indx: | 687 if indx: |
| 669 if dist_start_read2[nr] <= dist_end_read2[nr]: | 688 if dist_start_read2[nr] <= dist_end_read2[nr]: |
| 670 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number | 689 dist_end_read2[nr] = thr + 1000 # use dist of start and set start to very large number |
| 671 else: | 690 else: |
| 672 dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number | 691 dist_start_read2[nr] = thr + 1000 # use dist of end and set start to very large number |
| 673 ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads | 692 ratio_dist_start2 = safe_div(sum([True if x <= thr else False for x in dist_start_read2]), float(sum(softclipped_idx2))) >= threshold_reads |
| 674 #print(ratio_dist_end2) | |
| 675 #print([True if x <= thr else False for x in ratio_dist_end2]) | |
| 676 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads | 693 ratio_dist_end2 = safe_div(sum([True if x <= thr else False for x in dist_end_read2]), float(sum(softclipped_idx2))) >= threshold_reads |
| 677 print(key1, "mate2 ab", dist_start_read2, dist_end_read2,cigars_dcs2, ratio2, ratio_dist_start2, ratio_dist_end2) | |
| 678 | 694 |
| 679 # mate 2 - SSCS ba | 695 # mate 2 - SSCS ba |
| 680 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] | 696 softclipped_idx3 = [True if re.search(r"^[0-9]+S", string) or re.search(r"S$", string) else False for string in cigars_dcs3] |
| 681 ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads | 697 ratio3 = safe_div(sum(softclipped_idx3), float(len(softclipped_idx3))) >= threshold_reads |
| 682 if any(ij is True for ij in softclipped_idx3): | 698 if any(ij is True for ij in softclipped_idx3): |
| 683 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3] | 699 softclipped_both_ends_idx3 = [True if (re.search(r"^[0-9]+S", string) and re.search(r"S$", string)) else False for string in cigars_dcs3] |
| 684 softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3] | 700 softclipped_start3 = [int(string.split("S")[0]) if re.search(r"^[0-9]+S", string) else -1 for string in cigars_dcs3] |
| 685 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] | 701 softclipped_end3 = [int(re.split("[A-Z]", str(string))[-2]) if re.search(r"S$", string) else -1 for string in cigars_dcs3] |
| 686 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] | 702 dist_start_read3 = [(pos - soft) if soft != -1 else thr + 1000 for soft, pos in zip(softclipped_start3, pos_read3)] |
| 687 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] | 703 dist_end_read3 = [(length_read - pos - soft) if soft != -1 else thr + 1000 for soft, pos, length_read in zip(softclipped_end3, pos_read3, end_read3)] |
| 688 | |
| 689 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping | 704 # if read at both ends softclipped --> select end with smallest distance between mut position and softclipping |
| 690 if any(ij is True for ij in softclipped_both_ends_idx3): | 705 if any(ij is True for ij in softclipped_both_ends_idx3): |
| 691 print(softclipped_both_ends_idx3) | 706 print(softclipped_both_ends_idx3) |
| 692 for nr, indx in enumerate(softclipped_both_ends_idx3): | 707 for nr, indx in enumerate(softclipped_both_ends_idx3): |
| 693 if indx: | 708 if indx: |
| 694 if dist_start_read3[nr] <= dist_end_read3[nr]: | 709 if dist_start_read3[nr] <= dist_end_read3[nr]: |
| 695 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh | 710 dist_end_read3[nr] = thr + 1000 # use dist of start and set start to a larger number than thresh |
| 696 else: | 711 else: |
| 697 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number | 712 dist_start_read3[nr] = thr + 1000 # use dist of end and set start to very large number |
| 698 #print([True if x <= thr else False for x in dist_start_read3]) | |
| 699 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads | 713 ratio_dist_start3 = safe_div(sum([True if x <= thr else False for x in dist_start_read3]), float(sum(softclipped_idx3))) >= threshold_reads |
| 700 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads | 714 ratio_dist_end3 = safe_div(sum([True if x <= thr else False for x in dist_end_read3]), float(sum(softclipped_idx3))) >= threshold_reads |
| 701 print(key1, "mate2 ba", dist_start_read3, dist_end_read3,cigars_dcs3, ratio3, ratio_dist_start3, ratio_dist_end3) | |
| 702 | 715 |
| 703 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant | 716 if ((all(float(ij) >= 0.5 for ij in [alt1ff, alt4ff]) & # contradictory variant |
| 704 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | | 717 all(float(ij) == 0. for ij in [alt2ff, alt3ff])) | |
| 705 (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & | 718 (all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]) & |
| 706 all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): | 719 all(float(ij) == 0. for ij in [alt1ff, alt4ff]))): |
| 726 alt4ff = 0 | 739 alt4ff = 0 |
| 727 alt2ff = 0 | 740 alt2ff = 0 |
| 728 alt3ff = 0 | 741 alt3ff = 0 |
| 729 trimmed = False | 742 trimmed = False |
| 730 contradictory = False | 743 contradictory = False |
| 731 print(key1, "softclipped_mutation_allMates", softclipped_mutation_allMates) | |
| 732 # information of both mates available --> only one mate softclipped | 744 # information of both mates available --> only one mate softclipped |
| 733 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | | 745 elif (((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4)) | |
| 734 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & | 746 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3))) & |
| 735 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available | 747 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available |
| 736 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 748 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
| 749 min_start1 = min(min([ij[0] for ij in ref_positions1]), min([ij[0] for ij in ref_positions4])) # red | |
| 750 min_start2 = min(min([ij[0] for ij in ref_positions2]), min([ij[0] for ij in ref_positions3])) # blue | |
| 751 max_end1 = max(max([ij[1] for ij in ref_positions1]), max([ij[1] for ij in ref_positions4])) # red | |
| 752 max_end2 = max(max([ij[1] for ij in ref_positions2]), max([ij[1] for ij in ref_positions3])) # blue | |
| 753 if (min_start1 > min_start2) or (max_end1 > max_end2): # if mate1 is red and mate2 is blue | |
| 754 softclipped_mutation_oneOfTwoMates = False | |
| 755 # blue mate at beginning softclipped | |
| 756 if min_start1 > min_start2: | |
| 757 n_spacer_barcode = min_start1 - min_start2 | |
| 758 read_pos2 = read_pos2 - n_spacer_barcode | |
| 759 read_pos3 = read_pos3 - n_spacer_barcode | |
| 760 read_len_median2 = read_len_median2 - n_spacer_barcode | |
| 761 read_len_median3 = read_len_median3 - n_spacer_barcode | |
| 762 # red mate at end softclipped | |
| 763 if max_end1 > max_end2: | |
| 764 n_spacer_barcode = max_end1 - max_end2 | |
| 765 read_len_median1 = read_len_median1 - n_spacer_barcode | |
| 766 read_len_median4 = read_len_median4 - n_spacer_barcode | |
| 767 elif (min_start1 < min_start2) or (max_end1 < max_end2): # if mate1 is blue and mate2 is red | |
| 768 softclipped_mutation_oneOfTwoMates = False | |
| 769 if min_start1 < min_start2: | |
| 770 n_spacer_barcode = min_start2 - min_start1 | |
| 771 read_pos1 = read_pos1 - n_spacer_barcode | |
| 772 read_pos4 = read_pos4 - n_spacer_barcode | |
| 773 read_len_median1 = read_len_median1 - n_spacer_barcode | |
| 774 read_len_median4 = read_len_median4 - n_spacer_barcode | |
| 775 if max_end1 < max_end2: # if mate1 ends after mate 2 starts | |
| 776 n_spacer_barcode = max_end2 - max_end1 | |
| 777 read_len_median2 = read_len_median2 - n_spacer_barcode | |
| 778 read_len_median3 = read_len_median3 - n_spacer_barcode | |
| 779 else: | |
| 780 softclipped_mutation_oneOfTwoMates = True | |
| 781 alt1ff = 0 | |
| 782 alt4ff = 0 | |
| 783 alt2ff = 0 | |
| 784 alt3ff = 0 | |
| 785 trimmed = False | |
| 786 contradictory = False | |
| 737 softclipped_mutation_allMates = False | 787 softclipped_mutation_allMates = False |
| 738 softclipped_mutation_oneOfTwoMates = True | |
| 739 softclipped_mutation_oneOfTwoSSCS = False | 788 softclipped_mutation_oneOfTwoSSCS = False |
| 740 softclipped_mutation_oneOfTwoSSCS_diffMates = False | |
| 741 softclipped_mutation_oneMate = False | 789 softclipped_mutation_oneMate = False |
| 742 softclipped_mutation_oneMateOneSSCS = False | 790 softclipped_mutation_oneMateOneSSCS = False |
| 743 alt1ff = 0 | 791 |
| 744 alt4ff = 0 | 792 if softclipped_mutation_oneOfTwoMates is False: # check trimming tier |
| 745 alt2ff = 0 | 793 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): |
| 746 alt3ff = 0 | 794 beg1 = total1new |
| 747 trimmed = False | 795 total1new = 0 |
| 748 contradictory = False | 796 alt1ff = 0 |
| 749 print(key1, "softclipped_mutation_oneOfTwoMates", softclipped_mutation_oneOfTwoMates) | 797 alt1f = 0 |
| 798 trimmed = True | |
| 799 | |
| 800 if ((read_pos4 >= 0) and ((read_pos4 <= trim) | (abs(read_len_median4 - read_pos4) <= trim))): | |
| 801 beg4 = total4new | |
| 802 total4new = 0 | |
| 803 alt4ff = 0 | |
| 804 alt4f = 0 | |
| 805 trimmed = True | |
| 806 | |
| 807 if ((read_pos2 >= 0) and ((read_pos2 <= trim) | (abs(read_len_median2 - read_pos2) <= trim))): | |
| 808 beg2 = total2new | |
| 809 total2new = 0 | |
| 810 alt2ff = 0 | |
| 811 alt2f = 0 | |
| 812 trimmed = True | |
| 813 | |
| 814 if ((read_pos3 >= 0) and ((read_pos3 <= trim) | (abs(read_len_median3 - read_pos3) <= trim))): | |
| 815 beg3 = total3new | |
| 816 total3new = 0 | |
| 817 alt3ff = 0 | |
| 818 alt3f = 0 | |
| 819 trimmed = True | |
| 820 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) | |
| 821 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) | |
| 822 | |
| 750 # information of both mates available --> only one mate softclipped | 823 # information of both mates available --> only one mate softclipped |
| 751 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & | 824 elif (((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & |
| 752 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & | 825 ((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & |
| 753 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available | 826 all(float(ij) > 0. for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): # all mates available |
| 754 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 827 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
| 762 alt4ff = 0 | 835 alt4ff = 0 |
| 763 alt2ff = 0 | 836 alt2ff = 0 |
| 764 alt3ff = 0 | 837 alt3ff = 0 |
| 765 trimmed = False | 838 trimmed = False |
| 766 contradictory = False | 839 contradictory = False |
| 767 print(key1, "softclipped_mutation_oneOfTwoSSCS", softclipped_mutation_oneOfTwoSSCS, [alt1ff, alt2ff, alt3ff, alt4ff]) | |
| 768 | 840 |
| 769 # information of one mate available --> all reads of one mate are softclipped | 841 # information of one mate available --> all reads of one mate are softclipped |
| 770 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & | 842 elif ((ratio1 & ratio4 & (ratio_dist_start1 | ratio_dist_end1) & (ratio_dist_start4 | ratio_dist_end4) & |
| 771 all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | | 843 all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff])) | |
| 772 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & | 844 (ratio2 & ratio3 & (ratio_dist_start2 | ratio_dist_end2) & (ratio_dist_start3 | ratio_dist_end3) & |
| 773 all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available | 845 all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) > 0. for ij in [alt2ff, alt3ff]))): # all mates available |
| 774 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 846 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
| 775 #if ((((len(dist_start_read1) > 0 | len(dist_end_read1) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1))) & | |
| 776 # ((len(dist_start_read4) > 0 | len(dist_end_read4) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4)))) | | |
| 777 # (((len(dist_start_read2) > 0 | len(dist_end_read2) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2))) & | |
| 778 # ((len(dist_start_read3) > 0 | len(dist_end_read3) > 0 ) & all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3))))): | |
| 779 softclipped_mutation_allMates = False | 847 softclipped_mutation_allMates = False |
| 780 softclipped_mutation_oneOfTwoMates = False | 848 softclipped_mutation_oneOfTwoMates = False |
| 781 softclipped_mutation_oneOfTwoSSCS = False | 849 softclipped_mutation_oneOfTwoSSCS = False |
| 782 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 850 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
| 783 softclipped_mutation_oneMate = True | 851 softclipped_mutation_oneMate = True |
| 786 alt4ff = 0 | 854 alt4ff = 0 |
| 787 alt2ff = 0 | 855 alt2ff = 0 |
| 788 alt3ff = 0 | 856 alt3ff = 0 |
| 789 trimmed = False | 857 trimmed = False |
| 790 contradictory = False | 858 contradictory = False |
| 791 print(key1, "softclipped_mutation_oneMate", softclipped_mutation_oneMate) | |
| 792 # information of one mate available --> only one SSCS is softclipped | 859 # information of one mate available --> only one SSCS is softclipped |
| 793 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & | 860 elif ((((ratio1 & (ratio_dist_start1 | ratio_dist_end1)) | (ratio4 & (ratio_dist_start4 | ratio_dist_end4))) & |
| 794 (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | | 861 (all(float(ij) < 0. for ij in [alt2ff, alt3ff]) & all(float(ij) > 0. for ij in [alt1ff, alt4ff]))) | |
| 795 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & | 862 (((ratio2 & (ratio_dist_start2 | ratio_dist_end2)) | (ratio3 & (ratio_dist_start3 | ratio_dist_end3))) & |
| 796 (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available | 863 (all(float(ij) < 0. for ij in [alt1ff, alt4ff]) & all(float(ij) < 0. for ij in [alt2ff, alt3ff])))): # all mates available |
| 797 # if distance between softclipping and mutation is at start or end of the read smaller than threshold | 864 # if distance between softclipping and mutation is at start or end of the read smaller than threshold |
| 798 #if ((all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read1, dist_end_read1)) | | |
| 799 # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read4, dist_end_read4))) | | |
| 800 # (all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read2, dist_end_read2)) | | |
| 801 # all(ij <= thr or nm <= thr for ij, nm in zip(dist_start_read3, dist_end_read3)))): | |
| 802 softclipped_mutation_allMates = False | 865 softclipped_mutation_allMates = False |
| 803 softclipped_mutation_oneOfTwoMates = False | 866 softclipped_mutation_oneOfTwoMates = False |
| 804 softclipped_mutation_oneOfTwoSSCS = False | 867 softclipped_mutation_oneOfTwoSSCS = False |
| 805 softclipped_mutation_oneOfTwoSSCS_diffMates = False | 868 softclipped_mutation_oneOfTwoSSCS_diffMates = False |
| 806 softclipped_mutation_oneMate = False | 869 softclipped_mutation_oneMate = False |
| 809 alt4ff = 0 | 872 alt4ff = 0 |
| 810 alt2ff = 0 | 873 alt2ff = 0 |
| 811 alt3ff = 0 | 874 alt3ff = 0 |
| 812 trimmed = False | 875 trimmed = False |
| 813 contradictory = False | 876 contradictory = False |
| 814 print(key1, "softclipped_mutation_oneMateOneSSCS", softclipped_mutation_oneMateOneSSCS) | |
| 815 | 877 |
| 816 else: | 878 else: |
| 817 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): | 879 if ((read_pos1 >= 0) and ((read_pos1 <= trim) | (abs(read_len_median1 - read_pos1) <= trim))): |
| 818 beg1 = total1new | 880 beg1 = total1new |
| 819 total1new = 0 | 881 total1new = 0 |
| 841 alt3ff = 0 | 903 alt3ff = 0 |
| 842 alt3f = 0 | 904 alt3f = 0 |
| 843 trimmed = True | 905 trimmed = True |
| 844 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) | 906 details1 = (total1, total4, total1new, total4new, ref1, ref4, alt1, alt4, ref1f, ref4f, alt1f, alt4f, na1, na4, lowq1, lowq4, beg1, beg4) |
| 845 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) | 907 details2 = (total2, total3, total2new, total3new, ref2, ref3, alt2, alt3, ref2f, ref3f, alt2f, alt3f, na2, na3, lowq2, lowq3, beg2, beg3) |
| 846 | |
| 847 | |
| 848 #sum_highTiers = sum([tier_dict[key1][ij] for ij in tier_dict[key1].keys()[:6]]) | |
| 849 | 908 |
| 850 # assign tiers | 909 # assign tiers |
| 851 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & | 910 if ((all(int(ij) >= 3 for ij in [total1new, total4new]) & |
| 852 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | | 911 all(float(ij) >= 0.75 for ij in [alt1ff, alt4ff])) | |
| 853 (all(int(ij) >= 3 for ij in [total2new, total3new]) & | 912 (all(int(ij) >= 3 for ij in [total2new, total3new]) & |
| 913 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): | 972 all(float(ij) >= 0.5 for ij in [alt2ff, alt3ff]))): |
| 914 tier = "3.2" | 973 tier = "3.2" |
| 915 counter_tier32 += 1 | 974 counter_tier32 += 1 |
| 916 tier_dict[key1]["tier 3.2"] += 1 | 975 tier_dict[key1]["tier 3.2"] += 1 |
| 917 | 976 |
| 918 #elif (trimmed) and (sum_highTiers > 1): | |
| 919 # tier = "2.5" | |
| 920 # counter_tier25 += 1 | |
| 921 # tier_dict[key1]["tier 2.5"] += 1 | |
| 922 | |
| 923 elif (trimmed): | 977 elif (trimmed): |
| 924 tier = "4" | 978 tier = "4" |
| 925 counter_tier4 += 1 | 979 counter_tier4 += 1 |
| 926 tier_dict[key1]["tier 4"] += 1 | 980 tier_dict[key1]["tier 4"] += 1 |
| 981 | |
| 982 # assign tiers | |
| 983 if ((all(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | |
| 984 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | | |
| 985 (all(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | |
| 986 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): | |
| 987 trimmed_actual_high_tier = True | |
| 988 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & | |
| 989 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | |
| 990 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | |
| 991 all(float(ij) >= 0.75 for ij in [alt1ff, alt2ff, alt3ff, alt4ff])): | |
| 992 trimmed_actual_high_tier = True | |
| 993 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & | |
| 994 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | |
| 995 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | | |
| 996 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & | |
| 997 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | |
| 998 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): | |
| 999 trimmed_actual_high_tier = True | |
| 1000 elif (all(int(ij) >= 1 for ij in [total1new_trim, total2new_trim, total3new_trim, total4new_trim]) & | |
| 1001 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt2ff_trim, alt3ff_trim, alt4ff_trim])): | |
| 1002 trimmed_actual_high_tier = True | |
| 1003 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & | |
| 1004 any(int(ij) >= 3 for ij in [total2new_trim, total3new_trim]) & | |
| 1005 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]) & | |
| 1006 any(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim])) | | |
| 1007 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & | |
| 1008 any(int(ij) >= 3 for ij in [total1new_trim, total4new_trim]) & | |
| 1009 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]) & | |
| 1010 any(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim]))): | |
| 1011 trimmed_actual_high_tier = True | |
| 1012 elif ((all(int(ij) >= 1 for ij in [total1new_trim, total4new_trim]) & | |
| 1013 all(float(ij) >= 0.75 for ij in [alt1ff_trim, alt4ff_trim])) | | |
| 1014 (all(int(ij) >= 1 for ij in [total2new_trim, total3new_trim]) & | |
| 1015 all(float(ij) >= 0.75 for ij in [alt2ff_trim, alt3ff_trim]))): | |
| 1016 trimmed_actual_high_tier = True | |
| 1017 else: | |
| 1018 trimmed_actual_high_tier = False | |
| 927 | 1019 |
| 928 elif softclipped_mutation_allMates: | 1020 elif softclipped_mutation_allMates: |
| 929 tier = "5.1" | 1021 tier = "5.1" |
| 930 counter_tier51 += 1 | 1022 counter_tier51 += 1 |
| 931 tier_dict[key1]["tier 5.1"] += 1 | 1023 tier_dict[key1]["tier 5.1"] += 1 |
| 1006 | 1098 |
| 1007 # tags which have identical parts: | 1099 # tags which have identical parts: |
| 1008 if min_value == 0 or dist2 == 0: | 1100 if min_value == 0 or dist2 == 0: |
| 1009 min_tags_list_zeros.append(tag) | 1101 min_tags_list_zeros.append(tag) |
| 1010 chimera_tags.append(max_tag) | 1102 chimera_tags.append(max_tag) |
| 1011 # chimeric = True | |
| 1012 # else: | |
| 1013 # chimeric = False | |
| 1014 | |
| 1015 # if mate_b is False: | |
| 1016 # text = "pos {}: sample tag: {}; HD a = {}; HD b' = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, min_value, dist2, list(max_tag), chimeric) | |
| 1017 # else: | |
| 1018 # text = "pos {}: sample tag: {}; HD a' = {}; HD b = {}; similar tag(s): {}; chimeric = {}".format(pos, sample_tag, dist2, min_value, list(max_tag), chimeric) | |
| 1019 i += 1 | 1103 i += 1 |
| 1020 chimera_tags = [x for x in chimera_tags if x != []] | 1104 chimera_tags = [x for x in chimera_tags if x != []] |
| 1021 chimera_tags_new = [] | 1105 chimera_tags_new = [] |
| 1022 for i in chimera_tags: | 1106 for i in chimera_tags: |
| 1023 if len(i) > 1: | 1107 if len(i) > 1: |
| 1070 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) | 1154 # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) |
| 1071 #if trimmed: | 1155 #if trimmed: |
| 1072 #if key1 not in list(change_tier_after_print.keys()): | 1156 #if key1 not in list(change_tier_after_print.keys()): |
| 1073 # change_tier_after_print[key1] = [((row, line, line2))] | 1157 # change_tier_after_print[key1] = [((row, line, line2))] |
| 1074 #else: | 1158 #else: |
| 1075 change_tier_after_print.append((row, line, line2)) | 1159 change_tier_after_print.append((row, line, line2, trimmed_actual_high_tier)) |
| 1076 | 1160 |
| 1077 row += 3 | 1161 row += 3 |
| 1078 | 1162 |
| 1079 if chimera_correction: | 1163 if chimera_correction: |
| 1080 chimeric_dcs_high_tiers = 0 | 1164 chimeric_dcs_high_tiers = 0 |
| 1099 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: | 1183 if tier_dict[key1]["tier 4"] > 0 and sum_highTiers > 0: |
| 1100 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] | 1184 tier_dict[key1]["tier 2.5"] = tier_dict[key1]["tier 4"] |
| 1101 tier_dict[key1]["tier 4"] = 0 | 1185 tier_dict[key1]["tier 4"] = 0 |
| 1102 correct_tier = True | 1186 correct_tier = True |
| 1103 | 1187 |
| 1104 #lines = change_tier_after_print[key1] | |
| 1105 for sample in change_tier_after_print: | 1188 for sample in change_tier_after_print: |
| 1106 row_number = sample[0] | 1189 row_number = sample[0] |
| 1107 line1 = sample[1] | 1190 line1 = sample[1] |
| 1108 line2 = sample[2] | 1191 line2 = sample[2] |
| 1109 | 1192 actual_high_tier = sample[3] |
| 1110 if correct_tier and list(line1)[1] == "4": | 1193 current_tier = list(line1)[1] |
| 1194 | |
| 1195 if correct_tier and (current_tier == "4") and actual_high_tier: | |
| 1111 line1 = list(line1) | 1196 line1 = list(line1) |
| 1112 line1[1] = "2.5" | 1197 line1[1] = "2.5" |
| 1113 line1 = tuple(line1) | 1198 line1 = tuple(line1) |
| 1114 counter_tier25 += 1 | 1199 counter_tier25 += 1 |
| 1115 counter_tier4 -= 1 | 1200 counter_tier4 -= 1 |
