annotate cravat_convert/vcf_converter.py @ 25:c5f0a7e538ff draft

Uploaded
author in_silico
date Wed, 18 Jul 2018 10:18:36 -0400
parents 4b860b1f92fa
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
12
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
1 """
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
2 A module originally obtained from the cravat package. Modified to use in the vcf
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
3 converter galaxy tool.
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
4
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
5
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
6 Register of changes made (Chris Jacoby):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
7 1) Changed imports as galaxy tool won't have access to complete cravat python package
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
8 2) Defined BadFormatError in BaseConverted file, as I didn't have the BadFormatError module
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
9 """
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
10
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
11 from base_converter import BaseConverter, BadFormatError
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
12 import re
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
13
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
14 class CravatConverter(BaseConverter):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
15
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
16 def __init__(self):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
17 self.format_name = 'vcf'
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
18 self.samples = []
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
19 self.var_counter = 0
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
20 self.addl_cols = [{'name':'phred',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
21 'title':'Phred',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
22 'type':'string'},
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
23 {'name':'filter',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
24 'title':'VCF filter',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
25 'type':'string'},
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
26 {'name':'zygosity',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
27 'title':'Zygosity',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
28 'type':'string'},
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
29 {'name':'alt_reads',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
30 'title':'Alternate reads',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
31 'type':'int'},
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
32 {'name':'tot_reads',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
33 'title':'Total reads',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
34 'type':'int'},
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
35 {'name':'af',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
36 'title':'Variant allele frequency',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
37 'type':'float'}]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
38
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
39 def check_format(self, f):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
40 return f.readline().startswith('##fileformat=VCF')
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
41
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
42 def setup(self, f):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
43
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
44 vcf_line_no = 0
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
45 for line in f:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
46 vcf_line_no += 1
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
47 if len(line) < 6:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
48 continue
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
49 if line[:6] == '#CHROM':
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
50 toks = re.split('\s+', line.rstrip())
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
51 if len(toks) > 8:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
52 self.samples = toks[9:]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
53 break
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
54
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
55 def convert_line(self, l):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
56 if l.startswith('#'): return None
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
57 self.var_counter += 1
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
58 toks = l.strip('\r\n').split('\t')
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
59 all_wdicts = []
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
60 if len(toks) < 8:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
61 raise BadFormatError('Wrong VCF format')
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
62 [chrom, pos, tag, ref, alts, qual, filter, info] = toks[:8]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
63 if tag == '':
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
64 raise BadFormatError('ID column is blank')
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
65 elif tag == '.':
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
66 tag = 'VAR' + str(self.var_counter)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
67 if chrom[:3] != 'chr':
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
68 chrom = 'chr' + chrom
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
69 alts = alts.split(',')
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
70 len_alts = len(alts)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
71 if len(toks) == 8:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
72 for altno in range(len_alts):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
73 wdict = None
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
74 alt = alts[altno]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
75 newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
76 wdict = {'tags':tag,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
77 'chrom':chrom,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
78 'pos':newpos,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
79 'ref_base':newref,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
80 'alt_base':newalt,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
81 'sample_id':'no_sample',
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
82 'phred': qual,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
83 'filter': filter}
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
84 all_wdicts.append(wdict)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
85 elif len(toks) > 8:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
86 sample_datas = toks[9:]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
87 genotype_fields = {}
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
88 genotype_field_no = 0
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
89 for genotype_field in toks[8].split(':'):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
90 genotype_fields[genotype_field] = genotype_field_no
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
91 genotype_field_no += 1
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
92 if not ('GT' in genotype_fields):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
93 raise BadFormatError('No GT Field')
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
94 gt_field_no = genotype_fields['GT']
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
95 for sample_no in range(len(sample_datas)):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
96 sample = self.samples[sample_no]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
97 sample_data = sample_datas[sample_no].split(':')
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
98 gts = {}
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
99 for gt in sample_data[gt_field_no].replace('/', '|').split('|'):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
100 if gt == '.':
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
101 continue
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
102 else:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
103 gts[int(gt)] = True
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
104 for gt in sorted(gts.keys()):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
105 wdict = None
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
106 if gt == 0:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
107 continue
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
108 else:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
109 alt = alts[gt - 1]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
110 newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
111 zyg = self.homo_hetro(sample_data[gt_field_no])
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
112 depth, alt_reads, af = self.extract_read_info(sample_data, gt, gts, genotype_fields)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
113
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
114 wdict = {'tags':tag,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
115 'chrom':chrom,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
116 'pos':newpos,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
117 'ref_base':newref,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
118 'alt_base':newalt,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
119 'sample_id':sample,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
120 'phred': qual,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
121 'filter': filter,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
122 'zygosity': zyg,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
123 'tot_reads': depth,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
124 'alt_reads': alt_reads,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
125 'af': af,
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
126 }
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
127 all_wdicts.append(wdict)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
128 return all_wdicts
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
129
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
130 #The vcf genotype string has a call for each allele separated by '\' or '/'
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
131 #If the call is the same for all allels, return 'hom' otherwise 'het'
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
132 def homo_hetro(self, gt_str):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
133 if '.' in gt_str:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
134 return '';
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
135
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
136 gts = gt_str.strip().replace('/', '|').split('|')
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
137 for gt in gts:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
138 if gt != gts[0]:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
139 return 'het'
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
140 return 'hom'
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
141
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
142 #Extract read depth, allele count, and allele frequency from optional VCR information
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
143 def extract_read_info (self, sample_data, gt, gts, genotype_fields):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
144 depth = ''
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
145 alt_reads = ''
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
146 ref_reads = ''
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
147 af = ''
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
148
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
149 #AD contains 2 values usually ref count and alt count unless there are
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
150 #multiple alts then it will have alt 1 then alt 2.
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
151 if 'AD' in genotype_fields and genotype_fields['AD'] <= len(sample_data):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
152 if 0 in gts.keys():
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
153 #if part of the genotype is reference, then AD will have #ref reads, #alt reads
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
154 ref_reads = sample_data[genotype_fields['AD']].split(',')[0]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
155 alt_reads = sample_data[genotype_fields['AD']].split(',')[1]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
156 elif gt == max(gts.keys()):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
157 #if geontype has multiple alt bases, then AD will have #alt1 reads, #alt2 reads
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
158 alt_reads = sample_data[genotype_fields['AD']].split(',')[1]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
159 else:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
160 alt_reads = sample_data[genotype_fields['AD']].split(',')[0]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
161
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
162 if 'DP' in genotype_fields and genotype_fields['DP'] <= len(sample_data):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
163 depth = sample_data[genotype_fields['DP']]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
164 elif alt_reads != '' and ref_reads != '':
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
165 #if DP is not present but we have alt and ref reads count, dp = ref+alt
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
166 depth = int(alt_reads) + int(ref_reads)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
167
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
168 if 'AF' in genotype_fields and genotype_fields['AF'] <= len(sample_data):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
169 af = float(sample_data[genotype_fields['AF']] )
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
170 elif depth != '' and alt_reads != '':
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
171 #if AF not specified, calc it from alt and ref reads
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
172 af = float(alt_reads) / float(depth)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
173
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
174 return depth, alt_reads, af
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
175
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
176 def extract_vcf_variant (self, strand, pos, ref, alt):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
177
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
178 reflen = len(ref)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
179 altlen = len(alt)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
180
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
181 # Returns without change if same single nucleotide for ref and alt.
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
182 if reflen == 1 and altlen == 1 and ref == alt:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
183 return pos, ref, alt
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
184
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
185 # Trimming from the start and then the end of the sequence
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
186 # where the sequences overlap with the same nucleotides
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
187 new_ref2, new_alt2, new_pos = \
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
188 self.trimming_vcf_input(ref, alt, pos, strand)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
189
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
190 if new_ref2 == '':
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
191 new_ref2 = '-'
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
192 if new_alt2 == '':
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
193 new_alt2 = '-'
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
194
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
195 return new_pos, new_ref2, new_alt2
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
196
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
197 # This function looks at the ref and alt sequences and removes
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
198 # where the overlapping sequences contain the same nucleotide.
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
199 # This trims from the end first but does not remove the first nucleotide
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
200 # because based on the format of VCF input the
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
201 # first nucleotide of the ref and alt sequence occur
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
202 # at the position specified.
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
203 # End removed first, not the first nucleotide
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
204 # Front removed and position changed
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
205 def trimming_vcf_input(self, ref, alt, pos, strand):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
206 pos = int(pos)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
207 reflen = len(ref)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
208 altlen = len(alt)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
209 minlen = min(reflen, altlen)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
210 new_ref = ref
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
211 new_alt = alt
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
212 new_pos = pos
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
213 # Trims from the end. Except don't remove the first nucleotide.
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
214 # 1:6530968 CTCA -> GTCTCA becomes C -> GTC.
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
215 for nt_pos in range(0, minlen - 1):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
216 if ref[reflen - nt_pos - 1] == alt[altlen - nt_pos - 1]:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
217 new_ref = ref[:reflen - nt_pos - 1]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
218 new_alt = alt[:altlen - nt_pos - 1]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
219 else:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
220 break
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
221 new_ref_len = len(new_ref)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
222 new_alt_len = len(new_alt)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
223 minlen = min(new_ref_len, new_alt_len)
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
224 new_ref2 = new_ref
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
225 new_alt2 = new_alt
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
226 # Trims from the start. 1:6530968 G -> GT becomes 1:6530969 - -> T.
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
227 for nt_pos in range(0, minlen):
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
228 if new_ref[nt_pos] == new_alt[nt_pos]:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
229 if strand == '+':
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
230 new_pos += 1
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
231 elif strand == '-':
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
232 new_pos -= 1
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
233 new_ref2 = new_ref[nt_pos + 1:]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
234 new_alt2 = new_alt[nt_pos + 1:]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
235 else:
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
236 new_ref2 = new_ref[nt_pos:]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
237 new_alt2 = new_alt[nt_pos:]
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
238 break
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
239 return new_ref2, new_alt2, new_pos
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
240
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
241
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
242 if __name__ == "__main__":
4b860b1f92fa Uploaded
in_silico
parents:
diff changeset
243 c = CravatConverter()