Mercurial > repos > mheinzl > variant_analyzer2
changeset 79:d7aea14291e8 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author | mheinzl |
---|---|
date | Mon, 25 Jul 2022 13:24:04 +0000 |
parents | fdfe9a919ff7 |
children | 8336a4f2b647 |
files | read2mut.py read2mut.xml |
diffstat | 2 files changed, 121 insertions(+), 115 deletions(-) [+] |
line wrap: on
line diff
--- a/read2mut.py Fri Jul 22 09:19:44 2022 +0000 +++ b/read2mut.py Mon Jul 25 13:24:04 2022 +0000 @@ -313,6 +313,8 @@ count_other += 1 else: count_indel += 1 + print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, low quality = %s\n" % + (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_lowq)) mut_array = np.array(mut_array) for read in bam.fetch(until_eof=True): @@ -425,6 +427,10 @@ chimeric_tag = {} if (key1 in pure_tags_dict_short.keys()) or (key1 in pure_tags_dict_ref.keys()): # ref or alt + if key1 not in np.array(['#'.join(str(i) for i in z) + for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]): + continue + change_tier_after_print = [] i = np.where(np.array(['#'.join(str(i) for i in z) for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0] @@ -443,6 +449,7 @@ tier_dict_ref[key1][k] = v used_keys = [] + # used_keys_ref = [] if 'ab' in mut_pos_dict[key1].keys(): sscs_mut_ab = mut_pos_dict[key1]['ab'] else: @@ -465,11 +472,10 @@ if key2[:-5] not in tag_dict.keys() and key2[:-5] not in tag_dict_ref.keys(): # skip reads that have not alt or ref continue - if ((key2[:-5] in tag_dict.keys() and key2[:-5] in pure_tags_dict_short[key1].keys() and key1 in tag_dict[key2[:-5]].keys()) or (key2[:-5] in tag_dict_ref.keys() and key2[:-5] in pure_tags_dict_ref[key1].keys() and key1 in tag_dict_ref[key2[:-5]].keys())) and (key2[:-5] not in used_keys): - - if key2[:-5] in pure_tags_dict_short[key1].keys(): + if (((key2[:-5] in tag_dict.keys()) and (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key1 in tag_dict[key2[:-5]].keys()) and (key2[:-5] not in used_keys)) or ((key2[:-5] in tag_dict_ref.keys()) and (key2[:-5] in pure_tags_dict_ref[key1].keys()) and (key1 in tag_dict_ref[key2[:-5]].keys()) and (key2[:-5] not in used_keys))): + if key2[:-5] in tag_dict.keys(): variant_type = "alt" - elif key2[:-5] in pure_tags_dict_ref[key1].keys(): + elif key2[:-5] in tag_dict_ref.keys(): variant_type = "ref" if key2[:-5] + '.ab.1' in mut_dict[key1].keys(): @@ -670,7 +676,10 @@ end_read4 = reads_dict[key1][key2[:-5] + '.ba.2'] ref_positions4 = real_start_end[key1][key2[:-5] + '.ba.2'] + # if variant_type == "alt": used_keys.append(key2[:-5]) + # elif variant_type == "ref": + # used_keys_ref.append(key2[:-5]) counts_mut += 1 if (variant_type == "alt" and ((alt1f + alt2f + alt3f + alt4f) > 0.5)) or (variant_type == "ref" and ((ref1f + ref2f + ref3f + ref4f) > 0.5)): if variant_type == "alt": @@ -1240,61 +1249,64 @@ chrom, pos, ref_a, alt_a = re.split(r'\#', key1) var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) - sample_tag = key2[:-5] - array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time - # exclude identical tag from array2, to prevent comparison to itself - same_tag = np.where(array2 == sample_tag) - index_array2 = np.arange(0, len(array2), 1) - index_withoutSame = np.delete(index_array2, same_tag) # delete identical tag from the data - array2 = array2[index_withoutSame] - if len(array2) != 0: # only perform chimera analysis if there is more than 1 variant - array1_half = sample_tag[0:int(len(sample_tag) / 2)] # mate1 part1 - array1_half2 = sample_tag[int(len(sample_tag) / 2):int(len(sample_tag))] # mate1 part 2 - array2_half = np.array([ii[0:int(len(ii) / 2)] for ii in array2]) # mate2 part1 - array2_half2 = np.array([ii[int(len(ii) / 2):int(len(ii))] for ii in array2]) # mate2 part2 - min_tags_list_zeros = [] - chimera_tags = [] - for mate_b in [False, True]: - i = 0 # counter, only used to see how many HDs of tags were already calculated - if mate_b is False: # HD calculation for all a's - half1_mate1 = array1_half - half2_mate1 = array1_half2 - half1_mate2 = array2_half - half2_mate2 = array2_half2 - elif mate_b is True: # HD calculation for all b's - half1_mate1 = array1_half2 - half2_mate1 = array1_half - half1_mate2 = array2_half2 - half2_mate2 = array2_half - # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" - dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2]) - min_index = np.where(dist == dist.min()) # get index of min HD - # get all "b's" of the tag or all "a's" of the tag with minimum HD - min_tag_half2 = half2_mate2[min_index] - min_tag_array2 = array2[min_index] # get whole tag with min HD - min_value = dist.min() - # calculate HD of "b" to all "b's" or "a" to all "a's" - dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e)) - for e in min_tag_half2]) - dist2 = dist_second_half.max() - max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD - max_tag = min_tag_array2[max_index] - - # tags which have identical parts: - if min_value == 0 or dist2 == 0: - min_tags_list_zeros.append(tag) - chimera_tags.append(max_tag) - i += 1 - chimera_tags = [x for x in chimera_tags if x != []] - chimera_tags_new = [] - for i in chimera_tags: - if len(i) > 1: - for t in i: - chimera_tags_new.append(t) - else: - chimera_tags_new.extend(i) - chimera = ", ".join(chimera_tags_new) + if variant_type == "alt": + sample_tag = key2[:-5] + array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time + # exclude identical tag from array2, to prevent comparison to itself + same_tag = np.where(array2 == sample_tag) + index_array2 = np.arange(0, len(array2), 1) + index_withoutSame = np.delete(index_array2, same_tag) # delete identical tag from the data + array2 = array2[index_withoutSame] + if len(array2) != 0: # only perform chimera analysis if there is more than 1 variant + array1_half = sample_tag[0:int(len(sample_tag) / 2)] # mate1 part1 + array1_half2 = sample_tag[int(len(sample_tag) / 2):int(len(sample_tag))] # mate1 part 2 + array2_half = np.array([ii[0:int(len(ii) / 2)] for ii in array2]) # mate2 part1 + array2_half2 = np.array([ii[int(len(ii) / 2):int(len(ii))] for ii in array2]) # mate2 part2 + min_tags_list_zeros = [] + chimera_tags = [] + for mate_b in [False, True]: + i = 0 # counter, only used to see how many HDs of tags were already calculated + if mate_b is False: # HD calculation for all a's + half1_mate1 = array1_half + half2_mate1 = array1_half2 + half1_mate2 = array2_half + half2_mate2 = array2_half2 + elif mate_b is True: # HD calculation for all b's + half1_mate1 = array1_half2 + half2_mate1 = array1_half + half1_mate2 = array2_half2 + half2_mate2 = array2_half + # calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's" + dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2]) + min_index = np.where(dist == dist.min()) # get index of min HD + # get all "b's" of the tag or all "a's" of the tag with minimum HD + min_tag_half2 = half2_mate2[min_index] + min_tag_array2 = array2[min_index] # get whole tag with min HD + min_value = dist.min() + # calculate HD of "b" to all "b's" or "a" to all "a's" + dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e)) + for e in min_tag_half2]) + dist2 = dist_second_half.max() + max_index = np.where(dist_second_half == dist_second_half.max())[0] # get index of max HD + max_tag = min_tag_array2[max_index] + # tags which have identical parts: + if min_value == 0 or dist2 == 0: + min_tags_list_zeros.append(tag) + chimera_tags.append(max_tag) + i += 1 + chimera_tags = [x for x in chimera_tags if x != []] + chimera_tags_new = [] + for i in chimera_tags: + if len(i) > 1: + for t in i: + chimera_tags_new.append(t) + else: + chimera_tags_new.extend(i) + chimera = ", ".join(chimera_tags_new) + else: + chimera_tags_new = [] + chimera = "" else: chimera_tags_new = [] chimera = "" @@ -1316,27 +1328,23 @@ if (read_pos3 == -1): read_pos3 = read_len_median3 = None line = (var_id, tier, variant_type, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera) - # ws1.write_row(row, 0, line) - # csv_writer.writerow(line) line2 = ("", "", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera) - # ws1.write_row(row + 1, 0, line2) - # csv_writer.writerow(line2) - # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), - # {'type': 'formula', - # 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), - # 'format': format1, - # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) - # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), - # {'type': 'formula', - # 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), - # 'format': format3, - # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) - # ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2), - # {'type': 'formula', - # 'criteria': '=$B${}>="3"'.format(row + 1), - # 'format': format2, - # 'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) - change_tier_after_print.append((row, line, line2, trimmed_actual_high_tier)) + if tier != "4": + ws1.write_row(row, 0, line) + csv_writer.writerow(line) + ws1.write_row(row + 1, 0, line2) + csv_writer.writerow(line2) + if variant_type == "alt": + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + elif variant_type == "ref": + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + else: + change_tier_after_print.append((line, line2, trimmed_actual_high_tier)) + row += 3 if chimera_correction: @@ -1367,39 +1375,39 @@ tier_dict_ref[key1]["tier 2.5"] = tier_dict_ref[key1]["tier 4"] tier_dict_ref[key1]["tier 4"] = 0 correct_tier_ref = True - - for sample in change_tier_after_print: - row_number = sample[0] - line1 = sample[1] - line2 = sample[2] - actual_high_tier = sample[3] - current_tier = list(line1)[1] - - if line1[2] == "alt" and correct_tier and (current_tier == "4") and actual_high_tier: - line1 = list(line1) - line1[1] = "2.5" - line1 = tuple(line1) - counter_tier25 += 1 - counter_tier4 -= 1 - if line1[2] == "ref" and correct_tier_ref and (current_tier == "4") and actual_high_tier: - line1 = list(line1) - line1[1] = "2.5" - line1 = tuple(line1) - counter_tier25 += 1 - counter_tier4 -= 1 - - ws1.write_row(row_number, 0, line1) - csv_writer.writerow(line1) - ws1.write_row(row_number + 1, 0, line2) - csv_writer.writerow(line2) - if line1[2] == "alt": - ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - elif line1[2] == "ref": - ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) - ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)}) + # print(key1, "change tiers from tier 4 to tier 2.5 for {} DCS ...".format(len(change_tier_after_print))) + if len(change_tier_after_print) > 0: + for sample in change_tier_after_print: + # row_number = sample[0] + line1 = sample[0] + line2 = sample[1] + actual_high_tier = sample[2] + current_tier = list(line1)[1] + if line1[2] == "alt" and correct_tier and (current_tier == "4") and actual_high_tier: + line1 = list(line1) + line1[1] = "2.5" + line1 = tuple(line1) + counter_tier25 += 1 + counter_tier4 -= 1 + if line1[2] == "ref" and correct_tier_ref and (current_tier == "4") and actual_high_tier: + line1 = list(line1) + line1[1] = "2.5" + line1 = tuple(line1) + counter_tier25 += 1 + counter_tier4 -= 1 + ws1.write_row(row, 0, line1) + csv_writer.writerow(line1) + ws1.write_row(row + 1, 0, line2) + csv_writer.writerow(line2) + if line1[2] == "alt": + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + elif line1[2] == "ref": + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)}) + row += 3 # sheet 2 if chimera_correction: @@ -1429,9 +1437,7 @@ ref_count = cvrg_dict[key1][0] alt_count = cvrg_dict[key1][1] cvrg = ref_count + alt_count - ref_tiers = tier_dict_ref[key1] - var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) lst = [var_id, cvrg] used_tiers = [] @@ -1453,7 +1459,7 @@ fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:7]))) if fraction_chimeras is None: fraction_chimeras = 0. - new_cvrg = (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])) * (1. - fraction_chimeras) + new_cvrg = (cvrg - sum(used_tiers[-10:])) * (1. - fraction_chimeras) lst.extend([new_cvrg, chimeras_all, safe_div(new_alt, new_cvrg)]) lst.extend([alt_count, safe_div(alt_count, cvrg)]) lst.extend(used_tiers)
--- a/read2mut.xml Fri Jul 22 09:19:44 2022 +0000 +++ b/read2mut.xml Mon Jul 25 13:24:04 2022 +0000 @@ -35,7 +35,7 @@ <param name="thresh" type="integer" label="Tag count threshold" value="0" help="Integer threshold for displaying mutations. Only mutations occurring in DCS of less than thresh tags are displayed. Default of 0 displays all."/> <param name="phred" type="integer" label="Phred quality score threshold" min="0" max="41" value="20" help="Integer threshold for Phred quality score. Only reads higher than this threshold is considered. Default = 20."/> <param name="trim" type="integer" label="Trimming threshold" value="10" help="Integer threshold for assigning mutations at start and end of reads to lower tier. Default 10."/> - <param name="chimera_correction" type="boolean" label="Apply chimera correction?" truevalue="--chimera_correction" falsevalue="" checked="False" help="Count chimeric variants and correct the variant frequencies."/> + <param name="chimera_correction" type="boolean" label="Apply chimera correction?" truevalue="--chimera_correction" falsevalue="" checked="False" help="Count chimeric variants (not for the reference allele) and correct the variant frequencies."/> <param name="softclipping_dist" type="integer" label="Distance between artifact and softclipping of the reads" min="1" value="15" help="Count mutation as an artifact if mutation lies within this parameter away from the softclipping part of the reads. Default = 20"/> <param name="reads_threshold" type="float" label="Minimum percentage of softclipped reads in a family" min="0.0" max="1.0" value="1.0" help="Float number which specifies the minimum percentage of softclipped reads in a family to be considered in the softclipping tiers. Default: 1.0, means all reads of a family have to be softclipped."/> </inputs>