comparison read2mut.py @ 7:ded0dc6a20d3 draft

planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8
author mheinzl
date Mon, 25 Jan 2021 13:21:55 +0000
parents 11a2a34f8a2b
children ced1a529e7cd
comparison
equal deleted inserted replaced
6:11a2a34f8a2b 7:ded0dc6a20d3
128 for count, variant in enumerate(VCF(file1)): 128 for count, variant in enumerate(VCF(file1)):
129 #if count == 2000: 129 #if count == 2000:
130 # break 130 # break
131 chrom = variant.CHROM 131 chrom = variant.CHROM
132 stop_pos = variant.start 132 stop_pos = variant.start
133 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) 133 #chrom_stop_pos = str(chrom) + "#" + str(stop_pos)
134 ref = variant.REF 134 ref = variant.REF
135 alt = variant.ALT[0] 135 alt = variant.ALT[0]
136 # nc = variant.format('NC') 136 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt
137 ad = variant.format('AD') 137
138 if len(ref) == len(alt): 138 if len(ref) == len(alt):
139 mut_array.append([chrom, stop_pos, ref, alt]) 139 mut_array.append([chrom, stop_pos, ref, alt])
140 i += 1 140 i += 1
141 mut_dict[chrom_stop_pos] = {} 141 mut_dict[chrom_stop_pos] = {}
142 mut_read_pos_dict[chrom_stop_pos] = {} 142 mut_read_pos_dict[chrom_stop_pos] = {}
214 bam.close() 214 bam.close()
215 215
216 # create pure_tags_dict 216 # create pure_tags_dict
217 pure_tags_dict = {} 217 pure_tags_dict = {}
218 for key1, value1 in sorted(mut_dict.items()): 218 for key1, value1 in sorted(mut_dict.items()):
219 if len(np.where(np.array(['#'.join(str(i) for i in z) 219 #if len(np.where(np.array(['#'.join(str(i) for i in z)
220 for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0]) == 0: 220 # for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0]) == 0:
221 continue 221 # continue
222 222
223 i = np.where(np.array(['#'.join(str(i) for i in z) 223 i = np.where(np.array(['#'.join(str(i) for i in z)
224 for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0][0] 224 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
225 ref = mut_array[i, 2] 225 ref = mut_array[i, 2]
226 alt = mut_array[i, 3] 226 alt = mut_array[i, 3]
227 pure_tags_dict[key1] = {} 227 pure_tags_dict[key1] = {}
228 for key2, value2 in sorted(value1.items()): 228 for key2, value2 in sorted(value1.items()):
229 for key3, value3 in value2.items(): 229 for key3, value3 in value2.items():
308 counts_mut = 0 308 counts_mut = 0
309 chimeric_tag_list = [] 309 chimeric_tag_list = []
310 chimeric_tag = {} 310 chimeric_tag = {}
311 if key1 in pure_tags_dict_short.keys(): 311 if key1 in pure_tags_dict_short.keys():
312 i = np.where(np.array(['#'.join(str(i) for i in z) 312 i = np.where(np.array(['#'.join(str(i) for i in z)
313 for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0][0] 313 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
314 ref = mut_array[i, 2] 314 ref = mut_array[i, 2]
315 alt = mut_array[i, 3] 315 alt = mut_array[i, 3]
316 dcs_median = cvrg_dict[key1][2] 316 dcs_median = cvrg_dict[key1][2]
317 whole_array = pure_tags_dict_short[key1].keys() 317 whole_array = pure_tags_dict_short[key1].keys()
318 318
927 else: 927 else:
928 tier = "6" 928 tier = "6"
929 counter_tier6 += 1 929 counter_tier6 += 1
930 tier_dict[key1]["tier 6"] += 1 930 tier_dict[key1]["tier 6"] += 1
931 931
932 chrom, pos = re.split(r'\#', key1) 932 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
933 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) 933 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
934 sample_tag = key2[:-5] 934 sample_tag = key2[:-5]
935 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time 935 array2 = np.unique(whole_array) # remove duplicate sequences to decrease running time
936 # exclude identical tag from array2, to prevent comparison to itself 936 # exclude identical tag from array2, to prevent comparison to itself
937 same_tag = np.where(array2 == sample_tag) 937 same_tag = np.where(array2 == sample_tag)
1065 row = 0 1065 row = 0
1066 1066
1067 for key1, value1 in sorted(tier_dict.items()): 1067 for key1, value1 in sorted(tier_dict.items()):
1068 if key1 in pure_tags_dict_short.keys(): 1068 if key1 in pure_tags_dict_short.keys():
1069 i = np.where(np.array(['#'.join(str(i) for i in z) 1069 i = np.where(np.array(['#'.join(str(i) for i in z)
1070 for z in zip(mut_array[:, 0], mut_array[:, 1])]) == key1)[0][0] 1070 for z in zip(mut_array[:, 0], mut_array[:, 1], mut_array[:, 2], mut_array[:, 3])]) == key1)[0][0]
1071 ref = mut_array[i, 2] 1071 ref = mut_array[i, 2]
1072 alt = mut_array[i, 3] 1072 alt = mut_array[i, 3]
1073 chrom, pos = re.split(r'\#', key1) 1073 chrom, pos, ref_a, alt_a = re.split(r'\#', key1)
1074 ref_count = cvrg_dict[key1][0] 1074 ref_count = cvrg_dict[key1][0]
1075 alt_count = cvrg_dict[key1][1] 1075 alt_count = cvrg_dict[key1][1]
1076 cvrg = ref_count + alt_count 1076 cvrg = ref_count + alt_count
1077 1077
1078 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt]) 1078 var_id = '-'.join([chrom, str(int(pos)+1), ref, alt])
1152 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"), 1152 ("Tier 2.3", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in one mate and minimal FS>=3 for at least one of the SSCS in the other mate"),
1153 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"), 1153 ("Tier 2.4", "both ab and ba SSCS present (>75% of the sites with alt. base) and minimal FS=1 for both SSCS in at least one mate"),
1154 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"), 1154 ("Tier 3.1", "both ab and ba SSCS present (>50% of the sites with alt. base) and recurring mutation on this position"),
1155 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"), 1155 ("Tier 3.2", "both ab and ba SSCS present (>50% of the sites with alt. base) and minimal FS>=1 for both SSCS in at least one mate"),
1156 ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"), 1156 ("Tier 4.1", "variants at the start or end of the reads"), ("Tier 4.2", "mates with contradictory information"),
1157 ("Tier 5.1", "variants is close to softclipping in both mates"), 1157 ("Tier 5.1", "variant is close to softclipping in both mates"),
1158 ("Tier 5.2", "variants is close to softclipping in one of the mates"), 1158 ("Tier 5.2", "variant is close to softclipping in one of the mates"),
1159 ("Tier 5.3", "variants is close to softclipping in one of the SSCS of both mates"), 1159 ("Tier 5.3", "variant is close to softclipping in one of the SSCS of both mates"),
1160 ("Tier 5.4", "variants is close to softclipping in one mate (no information of second mate"), 1160 ("Tier 5.4", "variant is close to softclipping in one mate (no information of second mate"),
1161 ("Tier 5.5", "variants is close to softclipping in one of the SSCS (no information of the second mate"), 1161 ("Tier 5.5", "variant is close to softclipping in one of the SSCS (no information of the second mate"),
1162 ("Tier 6", "remaining variants")] 1162 ("Tier 6", "remaining variants")]
1163 examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289", 1163 examples_tiers = [[("Chr5:5-20000-11068-C-G", "1.1", "AAAAAGATGCCGACTACCTT", "ab1.ba2", "254", "228", "287", "288", "289",
1164 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0", 1164 "3", "6", "3", "6", "0", "0", "3", "6", "0", "0", "1", "1", "0", "0", "0", "0", "0", "0",
1165 "4081", "4098", "5", "10", "", ""), 1165 "4081", "4098", "5", "10", "", ""),
1166 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None, 1166 ("", "", "AAAAAGATGCCGACTACCTT", "ab2.ba1", None, None, None, None,