Mercurial > repos > mheinzl > variant_analyzer2
comparison mut2read.py @ 78:fdfe9a919ff7 draft
planemo upload for repository https://github.com/Single-Molecule-Genetics/VariantAnalyzerGalaxy/tree/master/tools/variant_analyzer commit ee4a8e6cf290e6c8a4d55f9cd2839d60ab3b11c8-dirty
author | mheinzl |
---|---|
date | Fri, 22 Jul 2022 09:19:44 +0000 |
parents | 6ccff403db8a |
children | e46d5e377760 |
comparison
equal
deleted
inserted
replaced
77:1797e461d674 | 78:fdfe9a919ff7 |
---|---|
18 """ | 18 """ |
19 | 19 |
20 import argparse | 20 import argparse |
21 import json | 21 import json |
22 import os | 22 import os |
23 import re | |
23 import sys | 24 import sys |
24 | 25 |
25 import numpy as np | 26 import numpy as np |
26 import pysam | 27 import pysam |
27 from cyvcf2 import VCF | 28 from cyvcf2 import VCF |
66 bam = pysam.AlignmentFile(file2, "rb") | 67 bam = pysam.AlignmentFile(file2, "rb") |
67 | 68 |
68 # get tags | 69 # get tags |
69 tag_dict = {} | 70 tag_dict = {} |
70 cvrg_dict = {} | 71 cvrg_dict = {} |
72 tag_dict_ref = {} | |
71 | 73 |
72 for variant in VCF(file1): | 74 for variant in VCF(file1): |
73 chrom = variant.CHROM | 75 chrom = variant.CHROM |
74 stop_pos = variant.start | 76 stop_pos = variant.start |
75 ref = variant.REF | 77 ref = variant.REF |
76 if len(variant.ALT) == 0: | 78 if len(variant.ALT) == 0: |
77 continue | 79 continue |
78 else: | 80 else: |
79 alt = variant.ALT[0] | 81 alt = variant.ALT[0] |
82 alt = alt.upper() | |
83 ref = ref.upper() | |
84 if "N" in alt: # skip indels with N in alt allele --> it is not an indel but just a mismatch at the position where the N is (checked this in IGV) | |
85 continue | |
80 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt | 86 chrom_stop_pos = str(chrom) + "#" + str(stop_pos) + "#" + ref + "#" + alt |
81 dcs_len = [] | 87 dcs_len = [] |
82 if len(ref) == len(alt): | 88 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): |
83 for pileupcolumn in bam.pileup(chrom, stop_pos - 1, stop_pos + 1, max_depth=100000000): | 89 if pileupcolumn.reference_pos == stop_pos: |
84 if pileupcolumn.reference_pos == stop_pos: | 90 count_alt = 0 |
85 count_alt = 0 | 91 count_ref = 0 |
86 count_ref = 0 | 92 count_indel = 0 |
87 count_indel = 0 | 93 count_n = 0 |
88 count_n = 0 | 94 count_other = 0 |
89 count_other = 0 | 95 count_lowq = 0 |
90 count_lowq = 0 | 96 for pileupread in pileupcolumn.pileups: |
91 print("unfiltered reads=", pileupcolumn.n, "filtered reads=", len(pileupcolumn.pileups), | 97 if not pileupread.is_refskip: |
92 "difference= ", len(pileupcolumn.pileups) - pileupcolumn.n) | 98 if pileupread.is_del: |
93 for pileupread in pileupcolumn.pileups: | 99 p = pileupread.query_position_or_next |
94 if not pileupread.is_del and not pileupread.is_refskip: | 100 e = p + len(alt) - 1 |
95 # query position is None if is_del or is_refskip is set. | |
96 nuc = pileupread.alignment.query_sequence[pileupread.query_position] | |
97 dcs_len.append(len(pileupread.alignment.query_sequence)) | |
98 if nuc == alt: | |
99 count_alt += 1 | |
100 tag = pileupread.alignment.query_name | |
101 if tag in tag_dict: | |
102 tag_dict[tag][chrom_stop_pos] = alt | |
103 else: | |
104 tag_dict[tag] = {} | |
105 tag_dict[tag][chrom_stop_pos] = alt | |
106 elif nuc == ref: | |
107 count_ref += 1 | |
108 elif nuc == "N": | |
109 count_n += 1 | |
110 elif nuc == "lowQ": | |
111 count_lowq += 1 | |
112 else: | |
113 count_other += 1 | |
114 else: | 101 else: |
115 count_indel += 1 | 102 p = pileupread.query_position |
116 dcs_median = np.median(np.array(dcs_len)) | 103 e = p + len(alt) |
117 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) | 104 s = p |
118 | 105 split_cigar = re.split('(\d+)', pileupread.alignment.cigarstring) |
119 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % | 106 if len(ref) < len(alt): |
120 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, | 107 if "I" in split_cigar: |
121 count_indel, count_lowq, dcs_median)) | 108 all_insertions = [inser_i for inser_i, ins in enumerate(split_cigar) if ins == "I"] |
122 else: | 109 for ai in all_insertions: # if multiple insertions in DCS |
123 print("indels are currently not evaluated") | 110 ins_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] |
111 ins_count = split_cigar[ai - 1] # nr of insertions should match with alt allele | |
112 if "I" in split_cigar and sum(ins_index) == p + 1 and len(alt) - 1 == int(ins_count): # if position in read matches and length of insertion | |
113 nuc = pileupread.alignment.query_sequence[s:e] | |
114 break | |
115 else: | |
116 nuc = pileupread.alignment.query_sequence[s] | |
117 else: | |
118 nuc = pileupread.alignment.query_sequence[s] | |
119 elif len(ref) > len(alt): | |
120 ref_positions = pileupread.alignment.get_reference_positions(full_length=True)[s:p + len(ref)] | |
121 if "D" in split_cigar: | |
122 all_deletions = [del_i for del_i, dele in enumerate(split_cigar) if dele == "D"] | |
123 for di, ai in enumerate(all_deletions): # if multiple insertions in DCS | |
124 if di > 0: # more than 1 deletion, don't count previous deletion to position | |
125 all_deletions_mod = split_cigar[:ai - 1] | |
126 prev_del_idx = [all_deletions_mod.index("D") - 1, all_deletions_mod.index("D")] | |
127 split_cigar_no_prev = [ad for i, ad in enumerate(all_deletions_mod) if i not in prev_del_idx] | |
128 del_index = [int(ci) for ci in split_cigar_no_prev[:ai - 1] if ci.isdigit()] | |
129 else: # first deletion in read, sum all previous (mis)matches and insertions to position | |
130 del_index = [int(ci) for ci in split_cigar[:ai - 1] if ci.isdigit()] | |
131 del_count = split_cigar[ai - 1] # nr of deletions should match with ref allele | |
132 if "D" in split_cigar and sum(del_index) == p + 1 and len(ref) - 1 == int(del_count): | |
133 nuc = pileupread.alignment.query_sequence[s:e] | |
134 if nuc == "": | |
135 nuc = str(alt) | |
136 break | |
137 else: | |
138 nuc = pileupread.alignment.query_sequence[s:s + len(ref)] | |
139 elif len(ref_positions) < len(ref): # DCS has reference but the position is at the very end of the DCS and therefore not the full reference positions are there | |
140 nuc = pileupread.alignment.get_reference_sequence()[s:s + len(ref)] | |
141 if nuc.upper() == ref[:len(nuc)]: | |
142 nuc = str(ref) | |
143 else: | |
144 nuc = pileupread.alignment.query_sequence[s:s + len(ref)] | |
145 else: # SNV: query position is None if is_del or is_refskip is set. | |
146 nuc = pileupread.alignment.query_sequence[s] | |
147 | |
148 nuc = nuc.upper() | |
149 tag = pileupread.alignment.query_name | |
150 if "_" in tag: | |
151 tag = re.split('_', tag)[0] | |
152 | |
153 if nuc == alt: | |
154 count_alt += 1 | |
155 if tag in tag_dict: | |
156 tag_dict[tag][chrom_stop_pos] = alt | |
157 else: | |
158 tag_dict[tag] = {} | |
159 tag_dict[tag][chrom_stop_pos] = alt | |
160 elif nuc == ref: | |
161 count_ref += 1 | |
162 if tag in tag_dict_ref: | |
163 tag_dict_ref[tag][chrom_stop_pos] = ref | |
164 else: | |
165 tag_dict_ref[tag] = {} | |
166 tag_dict_ref[tag][chrom_stop_pos] = ref | |
167 elif nuc == "N": | |
168 count_n += 1 | |
169 elif nuc == "lowQ": | |
170 count_lowq += 1 | |
171 else: | |
172 count_other += 1 | |
173 dcs_len.append(len(pileupread.alignment.query_sequence)) | |
174 | |
175 dcs_median = np.median(np.array(dcs_len)) | |
176 cvrg_dict[chrom_stop_pos] = (count_ref, count_alt, dcs_median) | |
177 print("coverage at pos %s = %s, ref = %s, alt = %s, other bases = %s, N = %s, indel = %s, low quality = %s, median length of DCS = %s\n" % | |
178 (pileupcolumn.pos, count_ref + count_alt, count_ref, count_alt, count_other, count_n, | |
179 count_indel, count_lowq, dcs_median)) | |
124 bam.close() | 180 bam.close() |
125 | 181 |
126 with open(json_file, "w") as f: | 182 with open(json_file, "w") as f: |
127 json.dump((tag_dict, cvrg_dict), f) | 183 json.dump((tag_dict, cvrg_dict, tag_dict_ref), f) |
128 | 184 |
129 # create fastq from aligned reads | 185 # create fastq from aligned reads |
130 with open(outfile, 'w') as out: | 186 with open(outfile, 'w') as out: |
131 with open(file3, 'r') as families: | 187 with open(file3, 'r') as families: |
132 for line in families: | 188 for line in families: |
133 line = line.rstrip('\n') | 189 line = line.rstrip('\n') |
134 splits = line.split('\t') | 190 splits = line.split('\t') |
135 tag = splits[0] | 191 tag = splits[0] |
136 | 192 |
137 if tag in tag_dict: | 193 if tag in tag_dict or tag in tag_dict_ref: |
138 str1 = splits[4] | 194 str1 = splits[4] |
139 curr_seq = str1.replace("-", "") | 195 curr_seq = str1.replace("-", "") |
140 str2 = splits[5] | 196 str2 = splits[5] |
141 curr_qual = str2.replace(" ", "") | 197 curr_qual = str2.replace(" ", "") |
142 | |
143 out.write("@" + splits[0] + "." + splits[1] + "." + splits[2] + "\n") | 198 out.write("@" + splits[0] + "." + splits[1] + "." + splits[2] + "\n") |
144 out.write(curr_seq + "\n") | 199 out.write(curr_seq + "\n") |
145 out.write("+" + "\n") | 200 out.write("+" + "\n") |
146 out.write(curr_qual + "\n") | 201 out.write(curr_qual + "\n") |
147 | 202 |