comparison cravat_convert/vcf_converter.py @ 18:cde9e74d9fdf draft

Uploaded
author in_silico
date Wed, 18 Jul 2018 09:41:20 -0400
parents c042835a7163
children
comparison
equal deleted inserted replaced
17:bea3b63f4b5d 18:cde9e74d9fdf
1 """
2 A module originally obtained from the cravat package. Modified to use in the vcf
3 converter galaxy tool.
4
5
6 Register of changes made (Chris Jacoby):
7 1) Changed imports as galaxy tool won't have access to complete cravat python package
8 2) Defined BadFormatError in BaseConverted file, as I didn't have the BadFormatError module
9 """
10
11 from base_converter import BaseConverter, BadFormatError
12 import re
13
14 class CravatConverter(BaseConverter):
15
16 def __init__(self):
17 self.format_name = 'vcf'
18 self.samples = []
19 self.var_counter = 0
20 self.addl_cols = [{'name':'phred',
21 'title':'Phred',
22 'type':'string'},
23 {'name':'filter',
24 'title':'VCF filter',
25 'type':'string'},
26 {'name':'zygosity',
27 'title':'Zygosity',
28 'type':'string'},
29 {'name':'alt_reads',
30 'title':'Alternate reads',
31 'type':'int'},
32 {'name':'tot_reads',
33 'title':'Total reads',
34 'type':'int'},
35 {'name':'af',
36 'title':'Variant allele frequency',
37 'type':'float'}]
38
39 def check_format(self, f):
40 return f.readline().startswith('##fileformat=VCF')
41
42 def setup(self, f):
43
44 vcf_line_no = 0
45 for line in f:
46 vcf_line_no += 1
47 if len(line) < 6:
48 continue
49 if line[:6] == '#CHROM':
50 toks = re.split('\s+', line.rstrip())
51 if len(toks) > 8:
52 self.samples = toks[9:]
53 break
54
55 def convert_line(self, l):
56 if l.startswith('#'): return None
57 self.var_counter += 1
58 toks = l.strip('\r\n').split('\t')
59 all_wdicts = []
60 if len(toks) < 8:
61 raise BadFormatError('Wrong VCF format')
62 [chrom, pos, tag, ref, alts, qual, filter, info] = toks[:8]
63 if tag == '':
64 raise BadFormatError('ID column is blank')
65 elif tag == '.':
66 tag = 'VAR' + str(self.var_counter)
67 if chrom[:3] != 'chr':
68 chrom = 'chr' + chrom
69 alts = alts.split(',')
70 len_alts = len(alts)
71 if len(toks) == 8:
72 for altno in range(len_alts):
73 wdict = None
74 alt = alts[altno]
75 newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt)
76 wdict = {'tags':tag,
77 'chrom':chrom,
78 'pos':newpos,
79 'ref_base':newref,
80 'alt_base':newalt,
81 'sample_id':'no_sample',
82 'phred': qual,
83 'filter': filter}
84 all_wdicts.append(wdict)
85 elif len(toks) > 8:
86 sample_datas = toks[9:]
87 genotype_fields = {}
88 genotype_field_no = 0
89 for genotype_field in toks[8].split(':'):
90 genotype_fields[genotype_field] = genotype_field_no
91 genotype_field_no += 1
92 if not ('GT' in genotype_fields):
93 raise BadFormatError('No GT Field')
94 gt_field_no = genotype_fields['GT']
95 for sample_no in range(len(sample_datas)):
96 sample = self.samples[sample_no]
97 sample_data = sample_datas[sample_no].split(':')
98 gts = {}
99 for gt in sample_data[gt_field_no].replace('/', '|').split('|'):
100 if gt == '.':
101 continue
102 else:
103 gts[int(gt)] = True
104 for gt in sorted(gts.keys()):
105 wdict = None
106 if gt == 0:
107 continue
108 else:
109 alt = alts[gt - 1]
110 newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt)
111 zyg = self.homo_hetro(sample_data[gt_field_no])
112 depth, alt_reads, af = self.extract_read_info(sample_data, gt, gts, genotype_fields)
113
114 wdict = {'tags':tag,
115 'chrom':chrom,
116 'pos':newpos,
117 'ref_base':newref,
118 'alt_base':newalt,
119 'sample_id':sample,
120 'phred': qual,
121 'filter': filter,
122 'zygosity': zyg,
123 'tot_reads': depth,
124 'alt_reads': alt_reads,
125 'af': af,
126 }
127 all_wdicts.append(wdict)
128 return all_wdicts
129
130 #The vcf genotype string has a call for each allele separated by '\' or '/'
131 #If the call is the same for all allels, return 'hom' otherwise 'het'
132 def homo_hetro(self, gt_str):
133 if '.' in gt_str:
134 return '';
135
136 gts = gt_str.strip().replace('/', '|').split('|')
137 for gt in gts:
138 if gt != gts[0]:
139 return 'het'
140 return 'hom'
141
142 #Extract read depth, allele count, and allele frequency from optional VCR information
143 def extract_read_info (self, sample_data, gt, gts, genotype_fields):
144 depth = ''
145 alt_reads = ''
146 ref_reads = ''
147 af = ''
148
149 #AD contains 2 values usually ref count and alt count unless there are
150 #multiple alts then it will have alt 1 then alt 2.
151 if 'AD' in genotype_fields and genotype_fields['AD'] <= len(sample_data):
152 if 0 in gts.keys():
153 #if part of the genotype is reference, then AD will have #ref reads, #alt reads
154 ref_reads = sample_data[genotype_fields['AD']].split(',')[0]
155 alt_reads = sample_data[genotype_fields['AD']].split(',')[1]
156 elif gt == max(gts.keys()):
157 #if geontype has multiple alt bases, then AD will have #alt1 reads, #alt2 reads
158 alt_reads = sample_data[genotype_fields['AD']].split(',')[1]
159 else:
160 alt_reads = sample_data[genotype_fields['AD']].split(',')[0]
161
162 if 'DP' in genotype_fields and genotype_fields['DP'] <= len(sample_data):
163 depth = sample_data[genotype_fields['DP']]
164 elif alt_reads != '' and ref_reads != '':
165 #if DP is not present but we have alt and ref reads count, dp = ref+alt
166 depth = int(alt_reads) + int(ref_reads)
167
168 if 'AF' in genotype_fields and genotype_fields['AF'] <= len(sample_data):
169 af = float(sample_data[genotype_fields['AF']] )
170 elif depth != '' and alt_reads != '':
171 #if AF not specified, calc it from alt and ref reads
172 af = float(alt_reads) / float(depth)
173
174 return depth, alt_reads, af
175
176 def extract_vcf_variant (self, strand, pos, ref, alt):
177
178 reflen = len(ref)
179 altlen = len(alt)
180
181 # Returns without change if same single nucleotide for ref and alt.
182 if reflen == 1 and altlen == 1 and ref == alt:
183 return pos, ref, alt
184
185 # Trimming from the start and then the end of the sequence
186 # where the sequences overlap with the same nucleotides
187 new_ref2, new_alt2, new_pos = \
188 self.trimming_vcf_input(ref, alt, pos, strand)
189
190 if new_ref2 == '':
191 new_ref2 = '-'
192 if new_alt2 == '':
193 new_alt2 = '-'
194
195 return new_pos, new_ref2, new_alt2
196
197 # This function looks at the ref and alt sequences and removes
198 # where the overlapping sequences contain the same nucleotide.
199 # This trims from the end first but does not remove the first nucleotide
200 # because based on the format of VCF input the
201 # first nucleotide of the ref and alt sequence occur
202 # at the position specified.
203 # End removed first, not the first nucleotide
204 # Front removed and position changed
205 def trimming_vcf_input(self, ref, alt, pos, strand):
206 pos = int(pos)
207 reflen = len(ref)
208 altlen = len(alt)
209 minlen = min(reflen, altlen)
210 new_ref = ref
211 new_alt = alt
212 new_pos = pos
213 # Trims from the end. Except don't remove the first nucleotide.
214 # 1:6530968 CTCA -> GTCTCA becomes C -> GTC.
215 for nt_pos in range(0, minlen - 1):
216 if ref[reflen - nt_pos - 1] == alt[altlen - nt_pos - 1]:
217 new_ref = ref[:reflen - nt_pos - 1]
218 new_alt = alt[:altlen - nt_pos - 1]
219 else:
220 break
221 new_ref_len = len(new_ref)
222 new_alt_len = len(new_alt)
223 minlen = min(new_ref_len, new_alt_len)
224 new_ref2 = new_ref
225 new_alt2 = new_alt
226 # Trims from the start. 1:6530968 G -> GT becomes 1:6530969 - -> T.
227 for nt_pos in range(0, minlen):
228 if new_ref[nt_pos] == new_alt[nt_pos]:
229 if strand == '+':
230 new_pos += 1
231 elif strand == '-':
232 new_pos -= 1
233 new_ref2 = new_ref[nt_pos + 1:]
234 new_alt2 = new_alt[nt_pos + 1:]
235 else:
236 new_ref2 = new_ref[nt_pos:]
237 new_alt2 = new_alt[nt_pos:]
238 break
239 return new_ref2, new_alt2, new_pos
240
241
242 if __name__ == "__main__":
243 c = CravatConverter()