Mercurial > repos > in_silico > cravat_vcf_convert
comparison cravat_convert/vcf_converter.py @ 18:cde9e74d9fdf draft
Uploaded
author | in_silico |
---|---|
date | Wed, 18 Jul 2018 09:41:20 -0400 |
parents | c042835a7163 |
children |
comparison
equal
deleted
inserted
replaced
17:bea3b63f4b5d | 18:cde9e74d9fdf |
---|---|
1 """ | |
2 A module originally obtained from the cravat package. Modified to use in the vcf | |
3 converter galaxy tool. | |
4 | |
5 | |
6 Register of changes made (Chris Jacoby): | |
7 1) Changed imports as galaxy tool won't have access to complete cravat python package | |
8 2) Defined BadFormatError in BaseConverted file, as I didn't have the BadFormatError module | |
9 """ | |
10 | |
11 from base_converter import BaseConverter, BadFormatError | |
12 import re | |
13 | |
14 class CravatConverter(BaseConverter): | |
15 | |
16 def __init__(self): | |
17 self.format_name = 'vcf' | |
18 self.samples = [] | |
19 self.var_counter = 0 | |
20 self.addl_cols = [{'name':'phred', | |
21 'title':'Phred', | |
22 'type':'string'}, | |
23 {'name':'filter', | |
24 'title':'VCF filter', | |
25 'type':'string'}, | |
26 {'name':'zygosity', | |
27 'title':'Zygosity', | |
28 'type':'string'}, | |
29 {'name':'alt_reads', | |
30 'title':'Alternate reads', | |
31 'type':'int'}, | |
32 {'name':'tot_reads', | |
33 'title':'Total reads', | |
34 'type':'int'}, | |
35 {'name':'af', | |
36 'title':'Variant allele frequency', | |
37 'type':'float'}] | |
38 | |
39 def check_format(self, f): | |
40 return f.readline().startswith('##fileformat=VCF') | |
41 | |
42 def setup(self, f): | |
43 | |
44 vcf_line_no = 0 | |
45 for line in f: | |
46 vcf_line_no += 1 | |
47 if len(line) < 6: | |
48 continue | |
49 if line[:6] == '#CHROM': | |
50 toks = re.split('\s+', line.rstrip()) | |
51 if len(toks) > 8: | |
52 self.samples = toks[9:] | |
53 break | |
54 | |
55 def convert_line(self, l): | |
56 if l.startswith('#'): return None | |
57 self.var_counter += 1 | |
58 toks = l.strip('\r\n').split('\t') | |
59 all_wdicts = [] | |
60 if len(toks) < 8: | |
61 raise BadFormatError('Wrong VCF format') | |
62 [chrom, pos, tag, ref, alts, qual, filter, info] = toks[:8] | |
63 if tag == '': | |
64 raise BadFormatError('ID column is blank') | |
65 elif tag == '.': | |
66 tag = 'VAR' + str(self.var_counter) | |
67 if chrom[:3] != 'chr': | |
68 chrom = 'chr' + chrom | |
69 alts = alts.split(',') | |
70 len_alts = len(alts) | |
71 if len(toks) == 8: | |
72 for altno in range(len_alts): | |
73 wdict = None | |
74 alt = alts[altno] | |
75 newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt) | |
76 wdict = {'tags':tag, | |
77 'chrom':chrom, | |
78 'pos':newpos, | |
79 'ref_base':newref, | |
80 'alt_base':newalt, | |
81 'sample_id':'no_sample', | |
82 'phred': qual, | |
83 'filter': filter} | |
84 all_wdicts.append(wdict) | |
85 elif len(toks) > 8: | |
86 sample_datas = toks[9:] | |
87 genotype_fields = {} | |
88 genotype_field_no = 0 | |
89 for genotype_field in toks[8].split(':'): | |
90 genotype_fields[genotype_field] = genotype_field_no | |
91 genotype_field_no += 1 | |
92 if not ('GT' in genotype_fields): | |
93 raise BadFormatError('No GT Field') | |
94 gt_field_no = genotype_fields['GT'] | |
95 for sample_no in range(len(sample_datas)): | |
96 sample = self.samples[sample_no] | |
97 sample_data = sample_datas[sample_no].split(':') | |
98 gts = {} | |
99 for gt in sample_data[gt_field_no].replace('/', '|').split('|'): | |
100 if gt == '.': | |
101 continue | |
102 else: | |
103 gts[int(gt)] = True | |
104 for gt in sorted(gts.keys()): | |
105 wdict = None | |
106 if gt == 0: | |
107 continue | |
108 else: | |
109 alt = alts[gt - 1] | |
110 newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt) | |
111 zyg = self.homo_hetro(sample_data[gt_field_no]) | |
112 depth, alt_reads, af = self.extract_read_info(sample_data, gt, gts, genotype_fields) | |
113 | |
114 wdict = {'tags':tag, | |
115 'chrom':chrom, | |
116 'pos':newpos, | |
117 'ref_base':newref, | |
118 'alt_base':newalt, | |
119 'sample_id':sample, | |
120 'phred': qual, | |
121 'filter': filter, | |
122 'zygosity': zyg, | |
123 'tot_reads': depth, | |
124 'alt_reads': alt_reads, | |
125 'af': af, | |
126 } | |
127 all_wdicts.append(wdict) | |
128 return all_wdicts | |
129 | |
130 #The vcf genotype string has a call for each allele separated by '\' or '/' | |
131 #If the call is the same for all allels, return 'hom' otherwise 'het' | |
132 def homo_hetro(self, gt_str): | |
133 if '.' in gt_str: | |
134 return ''; | |
135 | |
136 gts = gt_str.strip().replace('/', '|').split('|') | |
137 for gt in gts: | |
138 if gt != gts[0]: | |
139 return 'het' | |
140 return 'hom' | |
141 | |
142 #Extract read depth, allele count, and allele frequency from optional VCR information | |
143 def extract_read_info (self, sample_data, gt, gts, genotype_fields): | |
144 depth = '' | |
145 alt_reads = '' | |
146 ref_reads = '' | |
147 af = '' | |
148 | |
149 #AD contains 2 values usually ref count and alt count unless there are | |
150 #multiple alts then it will have alt 1 then alt 2. | |
151 if 'AD' in genotype_fields and genotype_fields['AD'] <= len(sample_data): | |
152 if 0 in gts.keys(): | |
153 #if part of the genotype is reference, then AD will have #ref reads, #alt reads | |
154 ref_reads = sample_data[genotype_fields['AD']].split(',')[0] | |
155 alt_reads = sample_data[genotype_fields['AD']].split(',')[1] | |
156 elif gt == max(gts.keys()): | |
157 #if geontype has multiple alt bases, then AD will have #alt1 reads, #alt2 reads | |
158 alt_reads = sample_data[genotype_fields['AD']].split(',')[1] | |
159 else: | |
160 alt_reads = sample_data[genotype_fields['AD']].split(',')[0] | |
161 | |
162 if 'DP' in genotype_fields and genotype_fields['DP'] <= len(sample_data): | |
163 depth = sample_data[genotype_fields['DP']] | |
164 elif alt_reads != '' and ref_reads != '': | |
165 #if DP is not present but we have alt and ref reads count, dp = ref+alt | |
166 depth = int(alt_reads) + int(ref_reads) | |
167 | |
168 if 'AF' in genotype_fields and genotype_fields['AF'] <= len(sample_data): | |
169 af = float(sample_data[genotype_fields['AF']] ) | |
170 elif depth != '' and alt_reads != '': | |
171 #if AF not specified, calc it from alt and ref reads | |
172 af = float(alt_reads) / float(depth) | |
173 | |
174 return depth, alt_reads, af | |
175 | |
176 def extract_vcf_variant (self, strand, pos, ref, alt): | |
177 | |
178 reflen = len(ref) | |
179 altlen = len(alt) | |
180 | |
181 # Returns without change if same single nucleotide for ref and alt. | |
182 if reflen == 1 and altlen == 1 and ref == alt: | |
183 return pos, ref, alt | |
184 | |
185 # Trimming from the start and then the end of the sequence | |
186 # where the sequences overlap with the same nucleotides | |
187 new_ref2, new_alt2, new_pos = \ | |
188 self.trimming_vcf_input(ref, alt, pos, strand) | |
189 | |
190 if new_ref2 == '': | |
191 new_ref2 = '-' | |
192 if new_alt2 == '': | |
193 new_alt2 = '-' | |
194 | |
195 return new_pos, new_ref2, new_alt2 | |
196 | |
197 # This function looks at the ref and alt sequences and removes | |
198 # where the overlapping sequences contain the same nucleotide. | |
199 # This trims from the end first but does not remove the first nucleotide | |
200 # because based on the format of VCF input the | |
201 # first nucleotide of the ref and alt sequence occur | |
202 # at the position specified. | |
203 # End removed first, not the first nucleotide | |
204 # Front removed and position changed | |
205 def trimming_vcf_input(self, ref, alt, pos, strand): | |
206 pos = int(pos) | |
207 reflen = len(ref) | |
208 altlen = len(alt) | |
209 minlen = min(reflen, altlen) | |
210 new_ref = ref | |
211 new_alt = alt | |
212 new_pos = pos | |
213 # Trims from the end. Except don't remove the first nucleotide. | |
214 # 1:6530968 CTCA -> GTCTCA becomes C -> GTC. | |
215 for nt_pos in range(0, minlen - 1): | |
216 if ref[reflen - nt_pos - 1] == alt[altlen - nt_pos - 1]: | |
217 new_ref = ref[:reflen - nt_pos - 1] | |
218 new_alt = alt[:altlen - nt_pos - 1] | |
219 else: | |
220 break | |
221 new_ref_len = len(new_ref) | |
222 new_alt_len = len(new_alt) | |
223 minlen = min(new_ref_len, new_alt_len) | |
224 new_ref2 = new_ref | |
225 new_alt2 = new_alt | |
226 # Trims from the start. 1:6530968 G -> GT becomes 1:6530969 - -> T. | |
227 for nt_pos in range(0, minlen): | |
228 if new_ref[nt_pos] == new_alt[nt_pos]: | |
229 if strand == '+': | |
230 new_pos += 1 | |
231 elif strand == '-': | |
232 new_pos -= 1 | |
233 new_ref2 = new_ref[nt_pos + 1:] | |
234 new_alt2 = new_alt[nt_pos + 1:] | |
235 else: | |
236 new_ref2 = new_ref[nt_pos:] | |
237 new_alt2 = new_alt[nt_pos:] | |
238 break | |
239 return new_ref2, new_alt2, new_pos | |
240 | |
241 | |
242 if __name__ == "__main__": | |
243 c = CravatConverter() |