variant_analyzer2: read2mut.py comparison

comparison read2mut.py @ 79:d7aea14291e8 draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty

author	mheinzl
date	Mon, 25 Jul 2022 13:24:04 +0000
parents	fdfe9a919ff7
children	8336a4f2b647

comparison

equal deleted inserted replaced

-:fdfe9a919ff7
+:d7aea14291e8
 count_lowq += 1
 else:
 count_other += 1
 else:
 count_indel += 1
+print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, low quality = %s\n" %
+(pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, count_lowq))
 mut_array = np.array(mut_array)
 for read in bam.fetch(until_eof=True):
 if read.is_unmapped:
 pure_tag = read.query_name[:-5]
 counts_mut = 0
 chimeric_tag_list = []
 chimeric_tag = {}
 if (key1 in pure_tags_dict_short.keys()) or (key1 in pure_tags_dict_ref.keys()):  # ref or alt
+if key1 not in np.array(['#'.join(str(i) for i in z)
+for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]):
+continue
 change_tier_after_print = []
 i = np.where(np.array(['#'.join(str(i) for i in z)
 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
 ref = mut_array[i, 2]
 alt = mut_array[i, 3]
 for k, v in values_tier_dict:
 tier_dict[key1][k] = v
 tier_dict_ref[key1][k] = v
 used_keys = []
+# used_keys_ref = []
 if 'ab' in mut_pos_dict[key1].keys():
 sscs_mut_ab = mut_pos_dict[key1]['ab']
 else:
 sscs_mut_ab = 0
 if 'ba' in mut_pos_dict[key1].keys():
 add_mut14 = ""
 add_mut23 = ""
 if key2[:-5] not in tag_dict.keys() and key2[:-5] not in tag_dict_ref.keys():  # skip reads that have not alt or ref
 continue
-if ((key2[:-5] in tag_dict.keys() and key2[:-5] in pure_tags_dict_short[key1].keys() and key1 in tag_dict[key2[:-5]].keys()) or (key2[:-5] in tag_dict_ref.keys() and key2[:-5] in pure_tags_dict_ref[key1].keys() and key1 in tag_dict_ref[key2[:-5]].keys())) and (key2[:-5] not in used_keys):
+if (((key2[:-5] in tag_dict.keys()) and (key2[:-5] in pure_tags_dict_short[key1].keys()) and (key1 in tag_dict[key2[:-5]].keys()) and (key2[:-5] not in used_keys)) or ((key2[:-5] in tag_dict_ref.keys()) and (key2[:-5] in pure_tags_dict_ref[key1].keys()) and (key1 in tag_dict_ref[key2[:-5]].keys()) and (key2[:-5] not in used_keys))):
+if key2[:-5] in tag_dict.keys():
-if key2[:-5] in pure_tags_dict_short[key1].keys():
 variant_type = "alt"
-elif key2[:-5] in pure_tags_dict_ref[key1].keys():
+elif key2[:-5] in tag_dict_ref.keys():
 variant_type = "ref"
 if key2[:-5] + '.ab.1' in mut_dict[key1].keys():
 total1 = sum(mut_dict[key1][key2[:-5] + '.ab.1'].values())
 if 'na' in mut_dict[key1][key2[:-5] + '.ab.1'].keys():
 cigars_dcs4 = mut_read_cigar_dict[key1][key2[:-5] + '.ba.2']
 pos_read4 = mut_read_pos_dict[key1][key2[:-5] + '.ba.2']
 end_read4 = reads_dict[key1][key2[:-5] + '.ba.2']
 ref_positions4 = real_start_end[key1][key2[:-5] + '.ba.2']
+# if variant_type == "alt":
 used_keys.append(key2[:-5])
+# elif variant_type == "ref":
+#     used_keys_ref.append(key2[:-5])
 counts_mut += 1
 if (variant_type == "alt" and ((alt1f + alt2f + alt3f + alt4f) > 0.5)) or (variant_type == "ref" and ((ref1f + ref2f + ref3f + ref4f) > 0.5)):
 if variant_type == "alt":
 tier1ff, tier2ff, tier3ff, tier4ff = alt1f, alt2f, alt3f, alt4f
 tier1ff_trim, tier2ff_trim, tier3ff_trim, tier4ff_trim = alt1f, alt2f, alt3f, alt4f
 elif variant_type == "ref":
 tier_dict_ref[key1]["tier 7"] += 1
 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
-sample_tag = key2[:-5]
-array2 = np.unique(whole_array)  # remove duplicate sequences to decrease running time
+if variant_type == "alt":
-# exclude identical tag from array2, to prevent comparison to itself
+sample_tag = key2[:-5]
-same_tag = np.where(array2 == sample_tag)
+array2 = np.unique(whole_array)  # remove duplicate sequences to decrease running time
-index_array2 = np.arange(0, len(array2), 1)
+# exclude identical tag from array2, to prevent comparison to itself
-index_withoutSame = np.delete(index_array2, same_tag)  # delete identical tag from the data
+same_tag = np.where(array2 == sample_tag)
-array2 = array2[index_withoutSame]
+index_array2 = np.arange(0, len(array2), 1)
-if len(array2) != 0:  # only perform chimera analysis if there is more than 1 variant
+index_withoutSame = np.delete(index_array2, same_tag)  # delete identical tag from the data
-array1_half = sample_tag[0:int(len(sample_tag) / 2)]  # mate1 part1
+array2 = array2[index_withoutSame]
-array1_half2 = sample_tag[int(len(sample_tag) / 2):int(len(sample_tag))]  # mate1 part 2
+if len(array2) != 0:  # only perform chimera analysis if there is more than 1 variant
-array2_half = np.array([ii[0:int(len(ii) / 2)] for ii in array2])  # mate2 part1
+array1_half = sample_tag[0:int(len(sample_tag) / 2)]  # mate1 part1
-array2_half2 = np.array([ii[int(len(ii) / 2):int(len(ii))] for ii in array2])  # mate2 part2
+array1_half2 = sample_tag[int(len(sample_tag) / 2):int(len(sample_tag))]  # mate1 part 2
+array2_half = np.array([ii[0:int(len(ii) / 2)] for ii in array2])  # mate2 part1
-min_tags_list_zeros = []
+array2_half2 = np.array([ii[int(len(ii) / 2):int(len(ii))] for ii in array2])  # mate2 part2
-chimera_tags = []
+min_tags_list_zeros = []
-for mate_b in [False, True]:
+chimera_tags = []
-i = 0  # counter, only used to see how many HDs of tags were already calculated
+for mate_b in [False, True]:
-if mate_b is False:  # HD calculation for all a's
+i = 0  # counter, only used to see how many HDs of tags were already calculated
-half1_mate1 = array1_half
+if mate_b is False:  # HD calculation for all a's
-half2_mate1 = array1_half2
+half1_mate1 = array1_half
-half1_mate2 = array2_half
+half2_mate1 = array1_half2
-half2_mate2 = array2_half2
+half1_mate2 = array2_half
-elif mate_b is True:  # HD calculation for all b's
+half2_mate2 = array2_half2
-half1_mate1 = array1_half2
+elif mate_b is True:  # HD calculation for all b's
-half2_mate1 = array1_half
+half1_mate1 = array1_half2
-half1_mate2 = array2_half2
+half2_mate1 = array1_half
-half2_mate2 = array2_half
+half1_mate2 = array2_half2
-# calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
+half2_mate2 = array2_half
-dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2])
+# calculate HD of "a" in the tag to all "a's" or "b" in the tag to all "b's"
-min_index = np.where(dist == dist.min())  # get index of min HD
+dist = np.array([sum(itertools.imap(operator.ne, half1_mate1, c)) for c in half1_mate2])
-# get all "b's" of the tag or all "a's" of the tag with minimum HD
+min_index = np.where(dist == dist.min())  # get index of min HD
-min_tag_half2 = half2_mate2[min_index]
+# get all "b's" of the tag or all "a's" of the tag with minimum HD
-min_tag_array2 = array2[min_index]  # get whole tag with min HD
+min_tag_half2 = half2_mate2[min_index]
-min_value = dist.min()
+min_tag_array2 = array2[min_index]  # get whole tag with min HD
-# calculate HD of "b" to all "b's" or "a" to all "a's"
+min_value = dist.min()
-dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
+# calculate HD of "b" to all "b's" or "a" to all "a's"
-for e in min_tag_half2])
+dist_second_half = np.array([sum(itertools.imap(operator.ne, half2_mate1, e))
-dist2 = dist_second_half.max()
+for e in min_tag_half2])
-max_index = np.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
+dist2 = dist_second_half.max()
-max_tag = min_tag_array2[max_index]
+max_index = np.where(dist_second_half == dist_second_half.max())[0]  # get index of max HD
+max_tag = min_tag_array2[max_index]
 # tags which have identical parts:
 if min_value == 0 or dist2 == 0:
 min_tags_list_zeros.append(tag)
 chimera_tags.append(max_tag)
 i += 1
 chimera_tags = [x for x in chimera_tags if x != []]
 chimera_tags_new = []
 for i in chimera_tags:
 if len(i) > 1:
 for t in i:
 chimera_tags_new.append(t)
 else:
 chimera_tags_new.extend(i)
 chimera = ", ".join(chimera_tags_new)
+else:
+chimera_tags_new = []
+chimera = ""
 else:
 chimera_tags_new = []
 chimera = ""
 if len(chimera_tags_new) > 0:
 if (read_pos2 == -1):
 read_pos2 = read_len_median2 = None
 if (read_pos3 == -1):
 read_pos3 = read_len_median3 = None
 line = (var_id, tier, variant_type, key2[:-5], 'ab1.ba2', read_pos1, read_pos4, read_len_median1, read_len_median4, dcs_median) + details1 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut14, chimera)
-# ws1.write_row(row, 0, line)
-# csv_writer.writerow(line)
 line2 = ("", "", "", key2[:-5], 'ab2.ba1', read_pos2, read_pos3, read_len_median2, read_len_median3, dcs_median) + details2 + (sscs_mut_ab, sscs_mut_ba, sscs_ref_ab, sscs_ref_ba, add_mut23, chimera)
-# ws1.write_row(row + 1, 0, line2)
+if tier != "4":
-# csv_writer.writerow(line2)
+ws1.write_row(row, 0, line)
-# ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
+csv_writer.writerow(line)
-#                       {'type': 'formula',
+ws1.write_row(row + 1, 0, line2)
-#                        'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1),
+csv_writer.writerow(line2)
-#                        'format': format1,
+if variant_type == "alt":
-#                        'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-# ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-#                       {'type': 'formula',
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-#                        'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1),
+elif variant_type == "ref":
-#                        'format': format3,
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-#                        'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-# ws1.conditional_format('L{}:M{}'.format(row + 1, row + 2),
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-#                       {'type': 'formula',
+else:
-#                        'criteria': '=$B${}>="3"'.format(row + 1),
+change_tier_after_print.append((line, line2, trimmed_actual_high_tier))
-#                        'format': format2,
-#                        'multi_range': 'L{}:M{} T{}:U{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-change_tier_after_print.append((row, line, line2, trimmed_actual_high_tier))
 row += 3
 if chimera_correction:
 chimeric_dcs_high_tiers = 0
 chimeric_dcs = 0
 if tier_dict_ref[key1]["tier 4"] > 0 and sum_highTiers_ref > 0:
 tier_dict_ref[key1]["tier 2.5"] = tier_dict_ref[key1]["tier 4"]
 tier_dict_ref[key1]["tier 4"] = 0
 correct_tier_ref = True
+# print(key1, "change tiers from tier 4 to tier 2.5 for {} DCS ...".format(len(change_tier_after_print)))
-for sample in change_tier_after_print:
+if len(change_tier_after_print) > 0:
-row_number = sample[0]
+for sample in change_tier_after_print:
-line1 = sample[1]
+# row_number = sample[0]
-line2 = sample[2]
+line1 = sample[0]
-actual_high_tier = sample[3]
+line2 = sample[1]
-current_tier = list(line1)[1]
+actual_high_tier = sample[2]
+current_tier = list(line1)[1]
 if line1[2] == "alt" and correct_tier and (current_tier == "4") and actual_high_tier:
 line1 = list(line1)
 line1[1] = "2.5"
 line1 = tuple(line1)
 counter_tier25 += 1
 counter_tier4 -= 1
 if line1[2] == "ref" and correct_tier_ref and (current_tier == "4") and actual_high_tier:
 line1 = list(line1)
 line1[1] = "2.5"
 line1 = tuple(line1)
 counter_tier25 += 1
 counter_tier4 -= 1
+ws1.write_row(row, 0, line1)
-ws1.write_row(row_number, 0, line1)
+csv_writer.writerow(line1)
-csv_writer.writerow(line1)
+ws1.write_row(row + 1, 0, line2)
-ws1.write_row(row_number + 1, 0, line2)
+csv_writer.writerow(line2)
-csv_writer.writerow(line2)
+if line1[2] == "alt":
-if line1[2] == "alt":
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} U{}:V{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+elif line1[2] == "ref":
-elif line1[2] == "ref":
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row + 1, row + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="1.1", $B${}="1.2")'.format(row_number + 1, row_number + 1), 'format': format1, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row + 1, row + 1, row + 1, row + 1, row + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=OR($B${}="2.1", $B${}="2.2", $B${}="2.3", $B${}="2.4", $B${}="2.5")'.format(row_number + 1, row_number + 1, row_number + 1, row_number + 1, row_number + 1), 'format': format3, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+ws1.conditional_format('M{}:N{}'.format(row + 1, row + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row + 1, row + 2, row + 1, row + 2, row + 1, row + 2)})
-ws1.conditional_format('M{}:N{}'.format(row_number + 1, row_number + 2), {'type': 'formula', 'criteria': '=$B${}>="3"'.format(row_number + 1), 'format': format2, 'multi_range': 'M{}:N{} S{}:T{} B{}'.format(row_number + 1, row_number + 2, row_number + 1, row_number + 2, row_number + 1, row_number + 2)})
+row += 3
 # sheet 2
 if chimera_correction:
 header_line2 = ('variant ID', 'cvrg', 'AC alt (all tiers)', 'AF (all tiers)', 'cvrg (tiers 1.1-2.5)', 'AC ref (tiers 1.1-2.5)', 'AC alt (tiers 1.1-2.5)', 'AF (tiers 1.1-2.5)', 'chimera-corrected cvrg (tiers 1.1-2.5)', 'chimeras in AC alt (tiers 1.1-2.5)', 'chimera-corrected AF (tiers 1.1-2.5)', 'AC alt (orginal DCS)', 'AF (original DCS)',
 'tier 1.1 (alt)', 'tier 1.2 (alt)', 'tier 2.1 (alt)', 'tier 2.2 (alt)', 'tier 2.3 (alt)', 'tier 2.4 (alt)', 'tier 2.5 (alt)',
 alt = mut_array[i, 3]
 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
 ref_count = cvrg_dict[key1][0]
 alt_count = cvrg_dict[key1][1]
 cvrg = ref_count + alt_count
 ref_tiers = tier_dict_ref[key1]
 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
 lst = [var_id, cvrg]
 used_tiers = []
 used_tiers_ref = [t for k, t in sorted(ref_tiers.items())]
 cum_af = []
 chimeras_all = chimera_dict[key1][1]
 new_alt = sum(used_tiers[0:7]) - chimeras_all
 fraction_chimeras = safe_div(chimeras_all, float(sum(used_tiers[0:7])))
 if fraction_chimeras is None:
 fraction_chimeras = 0.
-new_cvrg = (sum(used_tiers_ref[0:7]) + sum(used_tiers[0:7])) * (1. - fraction_chimeras)
+new_cvrg = (cvrg - sum(used_tiers[-10:])) * (1. - fraction_chimeras)
 lst.extend([new_cvrg, chimeras_all, safe_div(new_alt, new_cvrg)])
 lst.extend([alt_count, safe_div(alt_count, cvrg)])
 lst.extend(used_tiers)
 lst.extend(used_tiers_ref)
 # lst.extend(cum_af)

Mercurial > repos > mheinzl > variant_analyzer2

comparison read2mut.py @ 79:d7aea14291e8 draft