Mercurial > repos > jjohnson > snpeff_cds_report
annotate snpEff_cds_report.py @ 4:429a6b4df5e5
Restrict to info items that are key=value pairs
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Wed, 24 Apr 2013 08:00:04 -0500 |
parents | 15a54fa11ad7 |
children | 85b933b7d231 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/python |
2 import sys,os,tempfile,re | |
3 import httplib, urllib | |
4 import optparse | |
5 #import vcfClass | |
6 #from vcfClass import * | |
7 #import tools | |
8 #from tools import * | |
9 | |
10 debug = False | |
11 cds_anchor = 'cds_variation' | |
12 aa_anchor = 'aa_variation' | |
13 | |
14 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L", | |
15 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S", | |
16 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*", | |
17 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W", | |
18 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L", | |
19 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P", | |
20 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q", | |
21 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R", | |
22 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M", | |
23 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T", | |
24 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K", | |
25 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R", | |
26 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V", | |
27 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A", | |
28 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E", | |
29 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",} | |
30 | |
31 def reverseComplement(seq) : | |
32 rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper() | |
33 return rev | |
34 | |
35 def translate(seq) : | |
36 rna = seq.upper().replace('T','U') | |
37 aa = [] | |
38 for i in range(0,len(rna) - 2, 3): | |
39 aa.append(codon_map[rna[i:i+3]]) | |
40 return ''.join(aa) | |
41 | |
42 """ | |
43 SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand | |
44 Queries that request a sequence always return the sequence in the first column regardless of the order of attributes. | |
45 """ | |
46 query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?> | |
47 <!DOCTYPE Query> | |
48 <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" > | |
49 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" > | |
50 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/> | |
51 <Filter name = "with_ccds" excluded = "0"/> | |
52 <Attribute name = "ensembl_transcript_id" /> | |
53 <Attribute name = "ccds" /> | |
54 </Dataset> | |
55 </Query> | |
56 """ | |
57 ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>' | |
58 query_template = """<?xml version="1.0" encoding="UTF-8"?> | |
59 <!DOCTYPE Query> | |
60 <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" > | |
61 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" > | |
62 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/> | |
63 <Filter name = "biotype" value = "protein_coding"/> | |
64 __CCDS_FILTER_HERE__ | |
65 <Attribute name = "cdna" /> | |
66 <Attribute name = "ensembl_gene_id" /> | |
67 <Attribute name = "ensembl_transcript_id" /> | |
68 <Attribute name = "strand" /> | |
69 <Attribute name = "transcript_start" /> | |
70 <Attribute name = "transcript_end"/> | |
71 <Attribute name = "exon_chrom_start"/> | |
72 <Attribute name = "exon_chrom_end"/> | |
73 <Attribute name = "cdna_coding_start" /> | |
74 <Attribute name = "cdna_coding_end" /> | |
75 <Attribute name = "cds_length"/> | |
76 <Attribute name = "rank"/> | |
77 <Attribute name = "5_utr_start" /> | |
78 <Attribute name = "5_utr_end" /> | |
79 <Attribute name = "3_utr_start" /> | |
80 <Attribute name = "3_utr_end" /> | |
81 <Attribute name = "ensembl_peptide_id"/> | |
82 <Attribute name = "start_position" /> | |
83 <Attribute name = "end_position" /> | |
84 </Dataset> | |
85 </Query> | |
86 """ | |
87 """ | |
88 Columns(19): | |
89 ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)'] | |
90 | |
91 0 cDNA sequence | |
92 1 Ensembl Gene ID | |
93 2 Ensembl Transcript ID | |
94 3 Strand | |
95 - 4 Transcript Start (bp) | |
96 - 5 Transcript End (bp) | |
97 6 Exon Chr Start (bp) | |
98 7 Exon Chr End (bp) | |
99 8 CDS Start | |
100 9 CDS End | |
101 10 CDS Length | |
102 - 11 Exon Rank in Transcript | |
103 12 5' UTR Start | |
104 13 5' UTR End | |
105 14 3' UTR Start | |
106 15 3' UTR End | |
107 - 16 Ensembl Protein ID | |
108 - 17 Gene Start (bp) | |
109 - 18 Gene End (bp) | |
110 """ | |
111 | |
112 # return {transcript_id : ccds_id} | |
113 def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids): | |
114 ccds_dict = dict() | |
115 if transcript_ids: | |
116 query = query_ccds_template | |
117 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) | |
118 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query) | |
119 params = urllib.urlencode({'query':query}) | |
120 headers = {"Accept": "text/plain"} | |
121 if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query) | |
122 try: | |
123 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) | |
124 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') | |
125 conn.request("POST", "/biomart/martservice", params, headers) | |
126 response = conn.getresponse() | |
127 data = response.read() | |
128 if len(data) > 0: | |
129 # if debug: print >> sys.stdout, "%s\n\n" % data | |
130 lines = data.split('\n') | |
131 for line in lines: | |
132 fields = line.split('\t') | |
133 if len(fields) == 2: | |
134 (transcript_id,ccds) = fields | |
135 ccds_dict[transcript_id] = ccds | |
136 if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines) | |
137 except Exception, e: | |
138 sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) ) | |
139 return ccds_dict | |
140 | |
141 def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True): | |
142 query = query_template | |
143 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) | |
144 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query) | |
145 query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query) | |
146 return query | |
147 | |
148 # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence] | |
149 def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True): | |
150 transcript_id = snpEffect.transcript | |
151 chr_pos = snpEffect.pos | |
152 query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds) | |
153 if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query) | |
154 params = urllib.urlencode({'query':query}) | |
155 headers = {"Accept": "text/plain"} | |
156 pos = snpEffect.pos | |
157 cds_pos = None # zero based offset | |
158 coding_sequence = '' | |
159 cds_ref = snpEffect.ref | |
160 cds_alt = snpEffect.alt | |
161 try: | |
162 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) | |
163 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') | |
164 conn.request("POST", "/biomart/martservice", params, headers) | |
165 response = conn.getresponse() | |
166 data = response.read() | |
167 if len(data) > 0: | |
168 # if debug: print >> sys.stdout, "%s\n\n" % data | |
169 lines = data.split('\n') | |
170 # Use the header line to determine the order of fields | |
171 line = lines[0] | |
172 # if debug: print >> sys.stdout, "Headers:\n%s\n" % line | |
173 colnames = line.split('\t') | |
174 # for i in range(len(colnames)): | |
175 # | |
176 if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames) | |
177 for line in lines[1:]: | |
178 if line == None or len(line) < 2: | |
179 continue | |
180 field = line.split('\t') | |
181 if len(field) != len(colnames): | |
182 continue | |
183 if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line) | |
184 if field[10] == None or len(field[10]) < 1: | |
185 continue | |
186 if debug: | |
187 for i in range(len(colnames)): | |
188 print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i]) | |
189 snpEffect.gene_id = field[1] | |
190 strand = '+' if int(field[3]) > 0 else '-' | |
191 snpEffect.strand = strand | |
192 # coding_sequence is first | |
193 if len(field) > 10 and re.match('[ATGCN]+',field[0]): | |
194 if field[10] == None or len(field[10]) < 1: | |
195 continue | |
196 cdna_seq = field[0] | |
197 in_utr = False | |
198 # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence | |
199 utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-')) | |
200 utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-')) | |
201 utr_5_lens = [] | |
202 for i in range(len(utr_5_starts)): | |
203 utr_5_start = int(utr_5_starts[i]) | |
204 utr_5_end = int(utr_5_ends[i]) | |
205 utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1) | |
206 if utr_5_start <= pos <= utr_5_end : | |
207 in_utr = True | |
208 if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end); | |
209 break | |
210 utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-')) | |
211 utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-')) | |
212 for i in range(len(utr_3_starts)): | |
213 utr_3_start = int(utr_3_starts[i]) | |
214 utr_3_end = int(utr_3_ends[i]) | |
215 if utr_3_start <= pos <= utr_3_end : | |
216 in_utr = True | |
217 if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end); | |
218 break | |
219 # Get coding sequence from cdna | |
220 cdna_length = int(field[10]) | |
221 cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']')) | |
222 cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']')) | |
223 cdna_lens = [] | |
224 for i in range(len(cdna_coding_start)): | |
225 cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1) | |
226 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) | |
227 cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet | |
228 for i in range(len(cdna_coding_start)): | |
229 if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1] | |
230 coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]] | |
231 snpEffect.cds = coding_sequence | |
232 if coding_sequence and len(coding_sequence) >= 3: | |
233 snpEffect.cds_stop_codon = coding_sequence[-3:] | |
234 if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence) | |
235 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) | |
236 if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens) | |
237 if not in_utr: | |
238 exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-')) | |
239 exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-')) | |
240 exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-')) | |
241 exon_lens = [] | |
242 for i in range(len(exon_start)): | |
243 exon_lens.append(exon_end[i] - exon_start[i] + 1) | |
244 if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens) | |
245 if debug: | |
246 bp_tot = 0 | |
247 for i in range(len(exon_start)): | |
248 exon_len = exon_end[i] - exon_start[i] + 1 | |
249 bp_tot += exon_len | |
250 print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot) | |
251 cds_idx = 0 | |
252 for i in range(len(exon_start)): | |
253 # Does this exon have cds bases - i.e. not entirely in UTR | |
254 if len(utr_5_ends) > 0: | |
255 if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]: | |
256 continue | |
257 if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]: | |
258 continue | |
259 exon_len = exon_end[i] - exon_start[i] + 1 | |
260 if exon_start[i] <= pos <= exon_end[i]: | |
261 if strand == '+': | |
262 if cds_idx: # find offset from start of cdna_coding and exon | |
263 offset = pos - exon_start[i] | |
264 else: # find offset from end of cdna_coding and exon | |
265 offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) ) | |
266 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset | |
267 else: # negative strand | |
268 cds_ref = reverseComplement(snpEffect.ref) | |
269 cds_alt = reverseComplement(snpEffect.alt) | |
270 offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos | |
271 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1) | |
272 snpEffect.cds_pos = cds_pos | |
273 snpEffect.cds_ref = cds_ref | |
274 snpEffect.cds_alt = cds_alt | |
275 return snpEffect | |
276 else: | |
277 cds_idx += 1 | |
278 except Exception, e: | |
279 sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) ) | |
280 finally: | |
281 if conn != None : | |
282 conn.close() | |
283 return None | |
284 | |
285 """ | |
286 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
287 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
288 """ | |
289 class SnpEffect( object ): | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
290 report_headings = ['Gene','Variant Position','Reference','Variation','Penetrance','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','Stop Region','AA Variation'] |
0 | 291 def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False): |
292 self.chrom = chrom | |
293 self.pos = int(pos) | |
294 self.ref = ref | |
295 self.alt = alt | |
296 self.freq = float(freq) if freq else None | |
297 self.depth = int(depth) if depth else None | |
298 # From SnpEffect VCF String | |
299 self.effect = None | |
300 self.effect_impact = None | |
301 self.functional_class = None | |
302 self.codon_change = None | |
303 self.amino_acid_change = None | |
304 self.amino_acid_length = None | |
305 self.gene_name = None | |
306 self.gene_id = None | |
307 self.gene_biotype = None | |
308 self.coding = None | |
309 self.transcript = None | |
310 self.transcript_ids = None | |
311 self.exon = None | |
312 self.cds_stop_codon = None | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
313 self.cds_stop_pos = None |
0 | 314 # retrieve from ensembl |
315 self.strand = None | |
316 self.cds_pos = None | |
317 self.cds_ref = None | |
318 self.cds_alt = None | |
319 self.cds = None | |
320 self.aa_pos = None | |
321 self.aa_len = None | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
322 self.alt_stop_pos = None |
0 | 323 self.alt_stop_codon = None |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
324 self.alt_stop_region = None ## includes base before and after alt_stop_codon |
0 | 325 def chrPos(self): |
326 return "%s:%s" % (self.chrom,self.pos) | |
327 def setEffect(self, effect, snpEffVersion = None): | |
328 if snpEffVersion and snpEffVersion == '2': | |
329 (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9] | |
330 else: # SnpEffVersion 3 # adds Amino_Acid_length field | |
331 (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10] | |
332 self.amino_acid_length = amino_acid_length | |
333 self.effect_impact = effect_impact | |
334 self.functional_class = functional_class | |
335 self.codon_change = codon_change | |
336 self.amino_acid_change = amino_acid_change | |
337 self.gene_name = gene_name | |
338 self.gene_biotype = gene_biotype | |
339 self.coding = coding | |
340 self.transcript = transcript | |
341 self.exon = exon | |
342 def parseEffect(self, effect, snpEffVersion = None): | |
343 (eff,effs) = effect.rstrip(')').split('(') | |
344 self.effect = eff | |
345 eff_fields = effs.split('|') | |
346 if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields) | |
347 self.setEffect(eff_fields, snpEffVersion) | |
348 def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False): | |
349 getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds) | |
350 return len(self.cds) > 0 if self.cds else False | |
351 def score(self): | |
352 return self.freq * self.depth if self.freq and self.depth else -1 | |
353 def getCodingSequence(self): | |
354 return self.cds | |
355 def getVariantSequence(self): | |
356 if self.cds and self.cds_pos and self.cds_ref and self.cds_alt: | |
357 return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):] | |
358 else: | |
359 if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds) | |
360 return None | |
361 def getCodingAminoSequence(self): | |
362 seq = translate(self.cds) if self.cds else None | |
363 if seq: | |
364 self.aa_len = len(seq) | |
365 return seq | |
366 def getVariantAminoSequence(self): | |
367 variant_seq = self.getVariantSequence() | |
368 return translate(variant_seq) if variant_seq else None | |
369 def getVariantPeptide(self,toStopCodon=True): | |
370 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() | |
371 if var_aa: | |
372 if toStopCodon: | |
373 end_pos = var_aa_end | |
374 else: | |
375 end_pos = len(var_aa) | |
376 novel_peptide = var_aa[var_aa_pos:end_pos] | |
377 return novel_peptide | |
378 return None | |
379 # [preAA,varAA,postAA, varPeptide, varOffset, subAA] | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
380 def getNonSynonymousPeptide(self,start_count,end_count,toStopCodon=True): |
0 | 381 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() |
382 if var_aa: | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
383 start_pos = max(var_aa_pos - start_count,0) if start_count and int(start_count) >= 0 else 0 |
0 | 384 if toStopCodon: |
385 end_pos = var_aa_end | |
386 else: | |
1
29b286896c50
Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents:
0
diff
changeset
|
387 end_offset = end_count + 1 |
0 | 388 end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end |
389 try: | |
390 varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_' | |
391 if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos) | |
392 mutation = "p.%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA) | |
393 preAA = var_aa[start_pos:var_aa_pos] # N-term side | |
394 postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side | |
395 novel_peptide = var_aa[start_pos:end_pos] | |
396 return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation] | |
397 except Exception, e: | |
398 sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) ) | |
399 return None | |
400 # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop] | |
401 # positions are 0-based array indexes | |
402 def getSequences(self): | |
403 coding_dna = self.getCodingSequence() | |
404 coding_aa = self.getCodingAminoSequence() | |
405 var_dna = self.getVariantSequence() | |
406 var_aa = self.getVariantAminoSequence() | |
407 var_aa_pos = None | |
408 var_aa_end = None | |
409 if var_aa: | |
410 var_aa_pos = self.cds_pos / 3 | |
411 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))): | |
412 if var_aa[j] != coding_aa[j]: | |
413 var_aa_pos = j | |
414 break | |
415 self.aa_pos = var_aa_pos | |
416 var_stop = var_aa.find('*',var_aa_pos) | |
417 if var_stop < 0: | |
418 var_aa_end = len(var_aa) | |
419 else: | |
420 var_aa_end = var_stop + 1 # include the stop codon | |
421 stop_pos = var_stop * 3 | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
422 self.alt_stop_pos = stop_pos |
0 | 423 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3] |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
424 self.alt_stop_region = (var_dna[stop_pos - 1] + '-' if stop_pos > 0 else '') + \ |
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
425 self.alt_stop_codon + \ |
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
426 ( '-' + var_dna[stop_pos+3] if len(var_dna) > stop_pos+3 else '') |
0 | 427 return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end] |
428 def inHomopolymer(self,polyA_limit): | |
429 if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region | |
430 ## check if base changes at cds_pos involve A or T | |
431 bdiff = None # the difference between the cds_ref and cds_alt | |
432 boff = 0 # the offset to the difference | |
433 if len(self.cds_ref) < len(self.cds_alt): | |
434 bdiff = re.sub(self.cds_ref,'',self.cds_alt) | |
435 boff = self.cds_alt.find(bdiff) | |
436 elif len(self.cds_alt) < len(self.cds_ref): | |
437 bdiff = re.sub(self.cds_alt,'',self.cds_ref) | |
438 boff = self.cds_ref.find(bdiff) | |
439 if bdiff: | |
440 ## check the number of similar base neighboring the change | |
441 alt_seq = self.getVariantSequence() | |
442 subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))] | |
443 ## adjust match length if the cps_pos is within polyA_limit form sequence end | |
444 match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit)) | |
445 if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff) | |
446 ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times | |
447 if bdiff.find('A') >= 0: | |
448 pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq)) | |
449 match = re.match(pat,subseq) | |
450 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') | |
451 if match: | |
452 return True | |
453 if bdiff.find('T') >= 0: | |
454 pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq)) | |
455 match = re.match(pat,subseq) | |
456 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') | |
457 if match: | |
458 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups()) | |
459 return True | |
460 return False | |
461 def getReportHeader(): | |
462 return report_headings | |
463 def getReportFields(self): | |
464 gene_name = self.gene_name if self.gene_name else '' | |
465 cds_ref = self.cds_ref if self.cds_ref else '' | |
466 cds_alt = self.cds_alt if self.cds_alt else '' | |
467 hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
468 if debug: print >> sys.stdout, "HGVS %s" % hgvs | |
469 var_aa = self.getVariantPeptide(toStopCodon=True) | |
470 freq = "%2.2f" % self.freq if self.freq else '' | |
471 depth = "%d" % self.depth if self.depth else '' | |
472 aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else '' | |
473 aa_len = "%d" % self.aa_len if self.aa_len else '' | |
474 gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript | |
475 stop_codon = self.alt_stop_codon if self.alt_stop_codon else '' | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
476 stop_region = self.alt_stop_region if self.alt_stop_region else '' |
0 | 477 chrpos = self.chrPos() |
478 aa_change = self.amino_acid_change if self.amino_acid_change else hgvs[5] | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
479 return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,stop_region,var_aa if var_aa else ''] |
0 | 480 |
481 def __main__(): | |
482 | |
483 def printReportTsv(output_file, snpEffects): | |
484 if output_file: | |
485 print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings) | |
486 for snpEffect in snpEffects: | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
487 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields() |
0 | 488 if not snpEffect.effect == 'FRAME_SHIFT': |
489 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
490 novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA) | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
491 print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,novel_peptide]) |
0 | 492 |
493 """ | |
494 http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111 | |
495 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970 | |
496 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774 | |
497 """ | |
498 def printReportHtml(output_file, detail_dir, snpEffects): | |
499 if output_file: | |
500 print >> output_file, '<HTML><BODY>\n' | |
501 print >> output_file, '<TABLE BORDER="1">\n' | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
502 print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Penetrance</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n' |
0 | 503 for snpEffect in snpEffects: |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
504 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields() |
0 | 505 refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html' |
506 aa_ref = '#'.join([refname,aa_anchor]) | |
507 refpath = os.path.join(detail_dir,refname) | |
508 try: | |
509 ref_file = open(refpath,'w') | |
510 printEffDetailHtml(ref_file,snpEffect) | |
511 ref_file.close() | |
512 except Exception, e: | |
513 sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) ) | |
514 if snpEffect.effect == 'FRAME_SHIFT': | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
515 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref,novel_peptide) |
0 | 516 else: |
517 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) | |
2
15a54fa11ad7
snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents:
1
diff
changeset
|
518 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref, preAA,varAA,postAA) |
0 | 519 print >> output_file, '</TABLE>\n' |
520 print >> output_file, '</BODY></HTML>\n' | |
521 | |
522 def printEffDetailHtml(output_file,snpEffect,wrap_len=60): | |
523 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() | |
524 seq_len = len(coding_seq) | |
525 print >> output_file, '<HTML><BODY>\n' | |
526 print >> output_file, '<TABLE>\n' | |
527 print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name | |
528 print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt) | |
529 print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript) | |
530 print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand | |
531 print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1)) | |
532 print >> output_file, '</TABLE>\n' | |
533 if alt_aa: | |
534 alt_aa_pos = pos / 3 | |
535 if coding_aa: | |
536 for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))): | |
537 if alt_aa[j] != coding_aa[j]: | |
538 alt_aa_pos = j | |
539 break | |
540 alt_aa_end = 0 | |
541 for j in range(alt_aa_pos,len(alt_aa)): | |
542 if alt_aa[j] == '*': | |
543 alt_aa_end = j | |
544 break | |
545 seq = [] | |
546 for j in range(1,wrap_len+1): | |
547 seq.append('-' if j % 10 else '|') | |
548 hrule = '</TD><TD>'.join(seq) | |
549 print >> output_file, '<TABLE cellspacing="0">' | |
550 for i in range(0,seq_len,wrap_len): | |
551 e = min(i + wrap_len,seq_len) | |
552 sa = i/3 | |
553 ea = (i+wrap_len)/3 | |
554 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule) | |
555 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1) | |
556 for j in range(i,i + wrap_len): | |
557 if j < len(coding_seq): | |
558 if pos == j: | |
559 print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j]) | |
560 elif pos <= j < pos + len(ref): | |
561 print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j] | |
562 else: | |
563 print >> output_file, "<TD>%s</TD>" % coding_seq[j] | |
564 else: | |
565 print >> output_file, "<TD></TD>" | |
566 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e | |
567 if coding_aa: | |
568 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1) | |
569 for j in range(sa,ea): | |
570 if j < len(coding_aa): | |
571 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j] | |
572 else: | |
573 print >> output_file, "<TD COLSPAN=\"3\"></TD>" | |
574 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea | |
575 if alt_aa: | |
576 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1) | |
577 for j in range(sa,ea): | |
578 if j < len(alt_aa): | |
579 if alt_aa_pos == j: | |
580 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j]) | |
581 elif alt_aa_pos < j <= alt_aa_end: | |
582 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j] | |
583 else: | |
584 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j] | |
585 else: | |
586 print >> output_file, "<TD COLSPAN=\"3\"></TD>" | |
587 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea | |
588 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1) | |
589 for j in range(i,i + wrap_len): | |
590 if j < len(alt_seq): | |
591 if pos <= j < pos + len(alt): | |
592 print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j] | |
593 else: | |
594 print >> output_file, "<TD>%s</TD>" % alt_seq[j] | |
595 else: | |
596 print >> output_file, "<TD></TD>" | |
597 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e | |
598 print >> output_file, "</TABLE>" | |
599 print >> output_file, "</BODY></HTML>" | |
600 | |
601 def printSeqText(output_file,snpEffect, wrap_len=120): | |
602 nw = 6 | |
603 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() | |
604 if coding_seq: | |
605 seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0) | |
606 rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|' | |
607 for i in range(0,seq_len,wrap_len): | |
608 e = min(i + wrap_len,seq_len) | |
609 sa = i/3 | |
610 ea = e/3 | |
611 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'') | |
612 print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e) | |
613 if coding_aa: | |
614 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea) | |
615 if alt_aa: | |
616 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea) | |
617 if alt_seq: | |
618 print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e) | |
619 | |
620 # Parse the command line options | |
621 usage = "Usage: snpEff_cds_report.py filter [options]" | |
622 parser = optparse.OptionParser(usage = usage) | |
623 parser.add_option("-i", "--in", | |
624 action="store", type="string", | |
625 dest="vcfFile", help="input vcf file") | |
626 parser.add_option("-o", "--out", | |
627 action="store", type="string", | |
628 dest="output", help="output report file") | |
629 parser.add_option("-H", "--html_report", | |
630 action="store", type="string", | |
631 dest="html_file", help="html output report file") | |
632 parser.add_option("-D", "--html_dir", | |
633 action="store", type="string", | |
634 dest="html_dir", help="html output report file") | |
635 parser.add_option("-t", "--tsv_file", | |
636 action="store", type="string", | |
637 dest="tsv_file", help="TAB Separated Value (.tsv) output report file") | |
638 parser.add_option("-e", "--ensembl_host", | |
639 action="store", type="string", | |
640 dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org') | |
641 parser.add_option("-f", "--effects_filter", | |
642 action="store", type="string", default=None, | |
643 dest="effects_filter", help="a list of effects to filter") | |
644 parser.add_option("-O", "--ensembl_dataset", | |
645 action="store", type="string", | |
646 dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl') | |
647 parser.add_option("-p", "--polyA_limit", | |
648 action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' ) | |
649 parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' ) | |
650 parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' ) | |
651 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) | |
652 | |
653 (options, args) = parser.parse_args() | |
654 | |
655 debug = options.debug | |
656 ensembl_host = options.ensembl_host | |
657 | |
658 # Check that a single vcf file is given. | |
659 if options.vcfFile == None: | |
660 parser.print_help() | |
661 print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report " | |
662 exit(1) | |
663 outputFile = None | |
664 outputHtml = None | |
665 detailDir = None | |
666 outputTsv = None | |
667 tmpHtmlName = None | |
668 tmpHtml = None | |
669 effects_list = [] | |
670 # Set the output file to stdout if no output file was specified. | |
671 if options.output == None and options.html_file == None and options.tsv_file == None: | |
672 outputFile = sys.stdout | |
673 if options.output != None: | |
674 output = os.path.abspath(options.output) | |
675 outputFile = open(output, 'w') | |
676 if options.tsv_file != None: | |
677 output = os.path.abspath(options.tsv_file) | |
678 outputTsv = open(output, 'w') | |
679 if options.html_file != None: | |
680 output = os.path.abspath(options.html_file) | |
681 outputHtml = open(output, 'w') | |
682 if options.html_dir == None: | |
683 detailDir = os.path.dirname(os.path.abspath(output)) | |
684 else: | |
685 detailDir = options.html_dir | |
686 if detailDir: | |
687 if not os.path.isdir(detailDir): | |
688 os.makedirs(detailDir) | |
689 if options.effects_filter: | |
690 eff_codes = options.effects_filter.split(',') | |
691 for val in eff_codes: | |
692 code = val.strip() | |
693 if code: | |
694 effects_list.append(code) | |
695 | |
696 # | |
697 # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
698 try: | |
699 snpEffects = [] | |
700 homopolymers = [] | |
701 polya_count = 0 | |
702 polya_list = [] | |
703 fh = open( options.vcfFile ) | |
704 while True: | |
705 line = fh.readline() | |
706 if not line: | |
707 break #EOF | |
708 if line: | |
709 if outputFile: | |
710 print >> outputFile, "%s\n" % line | |
711 line = line.strip() | |
712 if len(line) < 1: | |
713 continue | |
714 elif line.startswith('#'): | |
715 # check for SnpEffVersion | |
716 """ | |
717 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" | |
718 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" | |
719 """ | |
720 if line.startswith('##SnpEffVersion='): | |
721 m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line) | |
722 if m: | |
723 SnpEffVersion = m.group(2) | |
724 # check for SnpEff Info Description | |
725 """ | |
726 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" | |
727 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> | |
728 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" | |
729 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> | |
730 """ | |
731 pass | |
732 else: | |
733 if debug: print >> sys.stdout, "%s\n" % line | |
734 freq = None | |
735 depth = None | |
736 fields = line.split('\t') | |
737 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8] | |
738 if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10) | |
739 if options.debug: print(fields) | |
740 for idx,alt in enumerate(alts.split(',')): | |
741 if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts) | |
742 if not re.match('^[ATCG]*$',alt): | |
743 print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt) | |
744 for info_item in info.split(';'): | |
745 try: | |
4
429a6b4df5e5
Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents:
2
diff
changeset
|
746 if info_item.find('=') < 0: |
429a6b4df5e5
Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents:
2
diff
changeset
|
747 continue |
0 | 748 (key,val) = info_item.split('=',1) |
749 if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333; | |
750 freqs = info_item.split(',') | |
751 (k,v) = freqs[idx].split('=',1) | |
752 freq = v | |
753 elif key == 'DP': | |
754 depth = val | |
755 elif key == 'EFF': | |
756 transcript_ids = [] | |
757 effects = val.split(',') | |
758 ccds_dict = None | |
759 for effect in effects: | |
760 (eff,effs) = effect.rstrip(')').split('(') | |
761 eff_fields = effs.split('|') | |
762 if SnpEffVersion == '2': | |
763 """ | |
764 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
765 EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
766 """ | |
767 (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9] | |
768 else: # SnpEffVersion 3 # adds Amino_Acid_length field | |
769 """ | |
770 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) | |
771 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) | |
772 """ | |
773 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10] | |
774 if transcript: | |
775 transcript_ids.append(transcript) | |
776 if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids) | |
777 if options.with_ccds: | |
778 ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids) | |
779 if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict) | |
780 for effect in effects: | |
781 snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth) | |
782 snpEffect.transcript_ids = transcript_ids | |
783 snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion) | |
784 if effects_list and not snpEffect.effect in effects_list: | |
785 continue | |
786 if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict): | |
787 if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds): | |
788 if snpEffect.cds_pos: | |
789 if snpEffect.inHomopolymer(options.polyA_limit): | |
790 homopolymers.append(snpEffect) | |
791 else: | |
792 snpEffects.append(snpEffect) | |
793 if outputFile: | |
794 print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields()) | |
795 printSeqText(outputFile,snpEffect) | |
796 if snpEffect.cds and snpEffect.cds_pos and not options.all_effects: | |
797 ## Just report the first sequence returned for a vcf line | |
798 break | |
799 except Exception, e: | |
800 sys.stderr.write( "line: %s err: %s\n" % (line,e) ) | |
801 print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers) | |
802 # Sort snpEffects | |
803 snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True) | |
804 if outputHtml: | |
805 printReportHtml(outputHtml, detailDir, snpEffects) | |
806 if outputTsv: | |
807 printReportTsv(outputTsv,snpEffects) | |
808 except Exception, e: | |
809 print(str(e)) | |
810 | |
811 if __name__ == "__main__": __main__() | |
812 |