comparison snpEff_cds_report.py @ 0:cdbdac66d6b5

Uploaded
author jjohnson
date Tue, 12 Mar 2013 13:59:14 -0400
parents
children 29b286896c50
comparison
equal deleted inserted replaced
-1:000000000000 0:cdbdac66d6b5
1 #!/usr/bin/python
2 import sys,os,tempfile,re
3 import httplib, urllib
4 import optparse
5 #import vcfClass
6 #from vcfClass import *
7 #import tools
8 #from tools import *
9
10 debug = False
11 cds_anchor = 'cds_variation'
12 aa_anchor = 'aa_variation'
13
14 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
15 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
16 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
17 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
18 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
19 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
20 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
21 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
22 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
23 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
24 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
25 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
26 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
27 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
28 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
29 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
30
31 def reverseComplement(seq) :
32 rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper()
33 return rev
34
35 def translate(seq) :
36 rna = seq.upper().replace('T','U')
37 aa = []
38 for i in range(0,len(rna) - 2, 3):
39 aa.append(codon_map[rna[i:i+3]])
40 return ''.join(aa)
41
42 """
43 SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand
44 Queries that request a sequence always return the sequence in the first column regardless of the order of attributes.
45 """
46 query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?>
47 <!DOCTYPE Query>
48 <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
49 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
50 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
51 <Filter name = "with_ccds" excluded = "0"/>
52 <Attribute name = "ensembl_transcript_id" />
53 <Attribute name = "ccds" />
54 </Dataset>
55 </Query>
56 """
57 ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>'
58 query_template = """<?xml version="1.0" encoding="UTF-8"?>
59 <!DOCTYPE Query>
60 <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
61 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
62 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
63 <Filter name = "biotype" value = "protein_coding"/>
64 __CCDS_FILTER_HERE__
65 <Attribute name = "cdna" />
66 <Attribute name = "ensembl_gene_id" />
67 <Attribute name = "ensembl_transcript_id" />
68 <Attribute name = "strand" />
69 <Attribute name = "transcript_start" />
70 <Attribute name = "transcript_end"/>
71 <Attribute name = "exon_chrom_start"/>
72 <Attribute name = "exon_chrom_end"/>
73 <Attribute name = "cdna_coding_start" />
74 <Attribute name = "cdna_coding_end" />
75 <Attribute name = "cds_length"/>
76 <Attribute name = "rank"/>
77 <Attribute name = "5_utr_start" />
78 <Attribute name = "5_utr_end" />
79 <Attribute name = "3_utr_start" />
80 <Attribute name = "3_utr_end" />
81 <Attribute name = "ensembl_peptide_id"/>
82 <Attribute name = "start_position" />
83 <Attribute name = "end_position" />
84 </Dataset>
85 </Query>
86 """
87 """
88 Columns(19):
89 ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)']
90
91 0 cDNA sequence
92 1 Ensembl Gene ID
93 2 Ensembl Transcript ID
94 3 Strand
95 - 4 Transcript Start (bp)
96 - 5 Transcript End (bp)
97 6 Exon Chr Start (bp)
98 7 Exon Chr End (bp)
99 8 CDS Start
100 9 CDS End
101 10 CDS Length
102 - 11 Exon Rank in Transcript
103 12 5' UTR Start
104 13 5' UTR End
105 14 3' UTR Start
106 15 3' UTR End
107 - 16 Ensembl Protein ID
108 - 17 Gene Start (bp)
109 - 18 Gene End (bp)
110 """
111
112 # return {transcript_id : ccds_id}
113 def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids):
114 ccds_dict = dict()
115 if transcript_ids:
116 query = query_ccds_template
117 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
118 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query)
119 params = urllib.urlencode({'query':query})
120 headers = {"Accept": "text/plain"}
121 if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query)
122 try:
123 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
124 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
125 conn.request("POST", "/biomart/martservice", params, headers)
126 response = conn.getresponse()
127 data = response.read()
128 if len(data) > 0:
129 # if debug: print >> sys.stdout, "%s\n\n" % data
130 lines = data.split('\n')
131 for line in lines:
132 fields = line.split('\t')
133 if len(fields) == 2:
134 (transcript_id,ccds) = fields
135 ccds_dict[transcript_id] = ccds
136 if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines)
137 except Exception, e:
138 sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) )
139 return ccds_dict
140
141 def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True):
142 query = query_template
143 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
144 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query)
145 query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query)
146 return query
147
148 # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence]
149 def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True):
150 transcript_id = snpEffect.transcript
151 chr_pos = snpEffect.pos
152 query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds)
153 if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query)
154 params = urllib.urlencode({'query':query})
155 headers = {"Accept": "text/plain"}
156 pos = snpEffect.pos
157 cds_pos = None # zero based offset
158 coding_sequence = ''
159 cds_ref = snpEffect.ref
160 cds_alt = snpEffect.alt
161 try:
162 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
163 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
164 conn.request("POST", "/biomart/martservice", params, headers)
165 response = conn.getresponse()
166 data = response.read()
167 if len(data) > 0:
168 # if debug: print >> sys.stdout, "%s\n\n" % data
169 lines = data.split('\n')
170 # Use the header line to determine the order of fields
171 line = lines[0]
172 # if debug: print >> sys.stdout, "Headers:\n%s\n" % line
173 colnames = line.split('\t')
174 # for i in range(len(colnames)):
175 #
176 if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames)
177 for line in lines[1:]:
178 if line == None or len(line) < 2:
179 continue
180 field = line.split('\t')
181 if len(field) != len(colnames):
182 continue
183 if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line)
184 if field[10] == None or len(field[10]) < 1:
185 continue
186 if debug:
187 for i in range(len(colnames)):
188 print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i])
189 snpEffect.gene_id = field[1]
190 strand = '+' if int(field[3]) > 0 else '-'
191 snpEffect.strand = strand
192 # coding_sequence is first
193 if len(field) > 10 and re.match('[ATGCN]+',field[0]):
194 if field[10] == None or len(field[10]) < 1:
195 continue
196 cdna_seq = field[0]
197 in_utr = False
198 # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence
199 utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-'))
200 utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-'))
201 utr_5_lens = []
202 for i in range(len(utr_5_starts)):
203 utr_5_start = int(utr_5_starts[i])
204 utr_5_end = int(utr_5_ends[i])
205 utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1)
206 if utr_5_start <= pos <= utr_5_end :
207 in_utr = True
208 if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end);
209 break
210 utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-'))
211 utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-'))
212 for i in range(len(utr_3_starts)):
213 utr_3_start = int(utr_3_starts[i])
214 utr_3_end = int(utr_3_ends[i])
215 if utr_3_start <= pos <= utr_3_end :
216 in_utr = True
217 if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end);
218 break
219 # Get coding sequence from cdna
220 cdna_length = int(field[10])
221 cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']'))
222 cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']'))
223 cdna_lens = []
224 for i in range(len(cdna_coding_start)):
225 cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1)
226 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
227 cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet
228 for i in range(len(cdna_coding_start)):
229 if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1]
230 coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]]
231 snpEffect.cds = coding_sequence
232 if coding_sequence and len(coding_sequence) >= 3:
233 snpEffect.cds_stop_codon = coding_sequence[-3:]
234 if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence)
235 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
236 if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens)
237 if not in_utr:
238 exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-'))
239 exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-'))
240 exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-'))
241 exon_lens = []
242 for i in range(len(exon_start)):
243 exon_lens.append(exon_end[i] - exon_start[i] + 1)
244 if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens)
245 if debug:
246 bp_tot = 0
247 for i in range(len(exon_start)):
248 exon_len = exon_end[i] - exon_start[i] + 1
249 bp_tot += exon_len
250 print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot)
251 cds_idx = 0
252 for i in range(len(exon_start)):
253 # Does this exon have cds bases - i.e. not entirely in UTR
254 if len(utr_5_ends) > 0:
255 if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]:
256 continue
257 if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]:
258 continue
259 exon_len = exon_end[i] - exon_start[i] + 1
260 if exon_start[i] <= pos <= exon_end[i]:
261 if strand == '+':
262 if cds_idx: # find offset from start of cdna_coding and exon
263 offset = pos - exon_start[i]
264 else: # find offset from end of cdna_coding and exon
265 offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) )
266 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset
267 else: # negative strand
268 cds_ref = reverseComplement(snpEffect.ref)
269 cds_alt = reverseComplement(snpEffect.alt)
270 offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos
271 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1)
272 snpEffect.cds_pos = cds_pos
273 snpEffect.cds_ref = cds_ref
274 snpEffect.cds_alt = cds_alt
275 return snpEffect
276 else:
277 cds_idx += 1
278 except Exception, e:
279 sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) )
280 finally:
281 if conn != None :
282 conn.close()
283 return None
284
285 """
286 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
287 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
288 """
289 class SnpEffect( object ):
290 report_headings = ['Gene','Variant Position','Reference','Variation','Penetrance Percent','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','AA Variation']
291 def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False):
292 self.chrom = chrom
293 self.pos = int(pos)
294 self.ref = ref
295 self.alt = alt
296 self.freq = float(freq) if freq else None
297 self.depth = int(depth) if depth else None
298 # From SnpEffect VCF String
299 self.effect = None
300 self.effect_impact = None
301 self.functional_class = None
302 self.codon_change = None
303 self.amino_acid_change = None
304 self.amino_acid_length = None
305 self.gene_name = None
306 self.gene_id = None
307 self.gene_biotype = None
308 self.coding = None
309 self.transcript = None
310 self.transcript_ids = None
311 self.exon = None
312 self.cds_stop_codon = None
313 # retrieve from ensembl
314 self.strand = None
315 self.cds_pos = None
316 self.cds_ref = None
317 self.cds_alt = None
318 self.cds = None
319 self.aa_pos = None
320 self.aa_len = None
321 self.alt_stop_codon = None
322 def chrPos(self):
323 return "%s:%s" % (self.chrom,self.pos)
324 def setEffect(self, effect, snpEffVersion = None):
325 if snpEffVersion and snpEffVersion == '2':
326 (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9]
327 else: # SnpEffVersion 3 # adds Amino_Acid_length field
328 (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10]
329 self.amino_acid_length = amino_acid_length
330 self.effect_impact = effect_impact
331 self.functional_class = functional_class
332 self.codon_change = codon_change
333 self.amino_acid_change = amino_acid_change
334 self.gene_name = gene_name
335 self.gene_biotype = gene_biotype
336 self.coding = coding
337 self.transcript = transcript
338 self.exon = exon
339 def parseEffect(self, effect, snpEffVersion = None):
340 (eff,effs) = effect.rstrip(')').split('(')
341 self.effect = eff
342 eff_fields = effs.split('|')
343 if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields)
344 self.setEffect(eff_fields, snpEffVersion)
345 def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False):
346 getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds)
347 return len(self.cds) > 0 if self.cds else False
348 def score(self):
349 return self.freq * self.depth if self.freq and self.depth else -1
350 def getCodingSequence(self):
351 return self.cds
352 def getVariantSequence(self):
353 if self.cds and self.cds_pos and self.cds_ref and self.cds_alt:
354 return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):]
355 else:
356 if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds)
357 return None
358 def getCodingAminoSequence(self):
359 seq = translate(self.cds) if self.cds else None
360 if seq:
361 self.aa_len = len(seq)
362 return seq
363 def getVariantAminoSequence(self):
364 variant_seq = self.getVariantSequence()
365 return translate(variant_seq) if variant_seq else None
366 def getVariantPeptide(self,toStopCodon=True):
367 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
368 if var_aa:
369 if toStopCodon:
370 end_pos = var_aa_end
371 else:
372 end_pos = len(var_aa)
373 novel_peptide = var_aa[var_aa_pos:end_pos]
374 return novel_peptide
375 return None
376 # [preAA,varAA,postAA, varPeptide, varOffset, subAA]
377 def getNonSynonymousPeptide(self,start_offset,end_offset,toStopCodon=True):
378 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
379 if var_aa:
380 start_pos = max(var_aa_pos - start_offset,0) if start_offset and int(start_offset) >= 0 else 0
381 if toStopCodon:
382 end_pos = var_aa_end
383 else:
384 end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end
385 try:
386 varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_'
387 if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos)
388 mutation = "p.%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA)
389 preAA = var_aa[start_pos:var_aa_pos] # N-term side
390 postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side
391 novel_peptide = var_aa[start_pos:end_pos]
392 return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation]
393 except Exception, e:
394 sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) )
395 return None
396 # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop]
397 # positions are 0-based array indexes
398 def getSequences(self):
399 coding_dna = self.getCodingSequence()
400 coding_aa = self.getCodingAminoSequence()
401 var_dna = self.getVariantSequence()
402 var_aa = self.getVariantAminoSequence()
403 var_aa_pos = None
404 var_aa_end = None
405 if var_aa:
406 var_aa_pos = self.cds_pos / 3
407 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))):
408 if var_aa[j] != coding_aa[j]:
409 var_aa_pos = j
410 break
411 self.aa_pos = var_aa_pos
412 var_stop = var_aa.find('*',var_aa_pos)
413 if var_stop < 0:
414 var_aa_end = len(var_aa)
415 else:
416 var_aa_end = var_stop + 1 # include the stop codon
417 stop_pos = var_stop * 3
418 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3]
419 return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end]
420 def inHomopolymer(self,polyA_limit):
421 if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region
422 ## check if base changes at cds_pos involve A or T
423 bdiff = None # the difference between the cds_ref and cds_alt
424 boff = 0 # the offset to the difference
425 if len(self.cds_ref) < len(self.cds_alt):
426 bdiff = re.sub(self.cds_ref,'',self.cds_alt)
427 boff = self.cds_alt.find(bdiff)
428 elif len(self.cds_alt) < len(self.cds_ref):
429 bdiff = re.sub(self.cds_alt,'',self.cds_ref)
430 boff = self.cds_ref.find(bdiff)
431 if bdiff:
432 ## check the number of similar base neighboring the change
433 alt_seq = self.getVariantSequence()
434 subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))]
435 ## adjust match length if the cps_pos is within polyA_limit form sequence end
436 match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit))
437 if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff)
438 ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times
439 if bdiff.find('A') >= 0:
440 pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq))
441 match = re.match(pat,subseq)
442 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
443 if match:
444 return True
445 if bdiff.find('T') >= 0:
446 pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq))
447 match = re.match(pat,subseq)
448 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
449 if match:
450 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups())
451 return True
452 return False
453 def getReportHeader():
454 return report_headings
455 def getReportFields(self):
456 gene_name = self.gene_name if self.gene_name else ''
457 cds_ref = self.cds_ref if self.cds_ref else ''
458 cds_alt = self.cds_alt if self.cds_alt else ''
459 hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False)
460 if debug: print >> sys.stdout, "HGVS %s" % hgvs
461 var_aa = self.getVariantPeptide(toStopCodon=True)
462 freq = "%2.2f" % self.freq if self.freq else ''
463 depth = "%d" % self.depth if self.depth else ''
464 aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else ''
465 aa_len = "%d" % self.aa_len if self.aa_len else ''
466 gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript
467 stop_codon = self.alt_stop_codon if self.alt_stop_codon else ''
468 chrpos = self.chrPos()
469 aa_change = self.amino_acid_change if self.amino_acid_change else hgvs[5]
470 return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,var_aa if var_aa else '']
471
472 def __main__():
473
474 def printReportTsv(output_file, snpEffects):
475 if output_file:
476 print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings)
477 for snpEffect in snpEffects:
478 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
479 if not snpEffect.effect == 'FRAME_SHIFT':
480 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
481 novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA)
482 print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide])
483
484 """
485 http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111
486 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970
487 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774
488 """
489 def printReportHtml(output_file, detail_dir, snpEffects):
490 if output_file:
491 print >> output_file, '<HTML><BODY>\n'
492 print >> output_file, '<TABLE BORDER="1">\n'
493 print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Penetrance Percent</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n'
494 for snpEffect in snpEffects:
495 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
496 refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html'
497 aa_ref = '#'.join([refname,aa_anchor])
498 refpath = os.path.join(detail_dir,refname)
499 try:
500 ref_file = open(refpath,'w')
501 printEffDetailHtml(ref_file,snpEffect)
502 ref_file.close()
503 except Exception, e:
504 sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) )
505 if snpEffect.effect == 'FRAME_SHIFT':
506 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref,novel_peptide)
507 else:
508 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
509 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref, preAA,varAA,postAA)
510 print >> output_file, '</TABLE>\n'
511 print >> output_file, '</BODY></HTML>\n'
512
513 def printEffDetailHtml(output_file,snpEffect,wrap_len=60):
514 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
515 seq_len = len(coding_seq)
516 print >> output_file, '<HTML><BODY>\n'
517 print >> output_file, '<TABLE>\n'
518 print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name
519 print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt)
520 print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript)
521 print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand
522 print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1))
523 print >> output_file, '</TABLE>\n'
524 if alt_aa:
525 alt_aa_pos = pos / 3
526 if coding_aa:
527 for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))):
528 if alt_aa[j] != coding_aa[j]:
529 alt_aa_pos = j
530 break
531 alt_aa_end = 0
532 for j in range(alt_aa_pos,len(alt_aa)):
533 if alt_aa[j] == '*':
534 alt_aa_end = j
535 break
536 seq = []
537 for j in range(1,wrap_len+1):
538 seq.append('-' if j % 10 else '|')
539 hrule = '</TD><TD>'.join(seq)
540 print >> output_file, '<TABLE cellspacing="0">'
541 for i in range(0,seq_len,wrap_len):
542 e = min(i + wrap_len,seq_len)
543 sa = i/3
544 ea = (i+wrap_len)/3
545 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule)
546 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
547 for j in range(i,i + wrap_len):
548 if j < len(coding_seq):
549 if pos == j:
550 print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j])
551 elif pos <= j < pos + len(ref):
552 print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j]
553 else:
554 print >> output_file, "<TD>%s</TD>" % coding_seq[j]
555 else:
556 print >> output_file, "<TD></TD>"
557 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
558 if coding_aa:
559 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
560 for j in range(sa,ea):
561 if j < len(coding_aa):
562 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j]
563 else:
564 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
565 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
566 if alt_aa:
567 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
568 for j in range(sa,ea):
569 if j < len(alt_aa):
570 if alt_aa_pos == j:
571 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j])
572 elif alt_aa_pos < j <= alt_aa_end:
573 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j]
574 else:
575 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j]
576 else:
577 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
578 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
579 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
580 for j in range(i,i + wrap_len):
581 if j < len(alt_seq):
582 if pos <= j < pos + len(alt):
583 print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j]
584 else:
585 print >> output_file, "<TD>%s</TD>" % alt_seq[j]
586 else:
587 print >> output_file, "<TD></TD>"
588 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
589 print >> output_file, "</TABLE>"
590 print >> output_file, "</BODY></HTML>"
591
592 def printSeqText(output_file,snpEffect, wrap_len=120):
593 nw = 6
594 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
595 if coding_seq:
596 seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0)
597 rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|'
598 for i in range(0,seq_len,wrap_len):
599 e = min(i + wrap_len,seq_len)
600 sa = i/3
601 ea = e/3
602 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'')
603 print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e)
604 if coding_aa:
605 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea)
606 if alt_aa:
607 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea)
608 if alt_seq:
609 print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e)
610
611 # Parse the command line options
612 usage = "Usage: snpEff_cds_report.py filter [options]"
613 parser = optparse.OptionParser(usage = usage)
614 parser.add_option("-i", "--in",
615 action="store", type="string",
616 dest="vcfFile", help="input vcf file")
617 parser.add_option("-o", "--out",
618 action="store", type="string",
619 dest="output", help="output report file")
620 parser.add_option("-H", "--html_report",
621 action="store", type="string",
622 dest="html_file", help="html output report file")
623 parser.add_option("-D", "--html_dir",
624 action="store", type="string",
625 dest="html_dir", help="html output report file")
626 parser.add_option("-t", "--tsv_file",
627 action="store", type="string",
628 dest="tsv_file", help="TAB Separated Value (.tsv) output report file")
629 parser.add_option("-e", "--ensembl_host",
630 action="store", type="string",
631 dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org')
632 parser.add_option("-f", "--effects_filter",
633 action="store", type="string", default=None,
634 dest="effects_filter", help="a list of effects to filter")
635 parser.add_option("-O", "--ensembl_dataset",
636 action="store", type="string",
637 dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl')
638 parser.add_option("-p", "--polyA_limit",
639 action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' )
640 parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' )
641 parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' )
642 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
643
644 (options, args) = parser.parse_args()
645
646 debug = options.debug
647 ensembl_host = options.ensembl_host
648
649 # Check that a single vcf file is given.
650 if options.vcfFile == None:
651 parser.print_help()
652 print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report "
653 exit(1)
654 outputFile = None
655 outputHtml = None
656 detailDir = None
657 outputTsv = None
658 tmpHtmlName = None
659 tmpHtml = None
660 effects_list = []
661 # Set the output file to stdout if no output file was specified.
662 if options.output == None and options.html_file == None and options.tsv_file == None:
663 outputFile = sys.stdout
664 if options.output != None:
665 output = os.path.abspath(options.output)
666 outputFile = open(output, 'w')
667 if options.tsv_file != None:
668 output = os.path.abspath(options.tsv_file)
669 outputTsv = open(output, 'w')
670 if options.html_file != None:
671 output = os.path.abspath(options.html_file)
672 outputHtml = open(output, 'w')
673 if options.html_dir == None:
674 detailDir = os.path.dirname(os.path.abspath(output))
675 else:
676 detailDir = options.html_dir
677 if detailDir:
678 if not os.path.isdir(detailDir):
679 os.makedirs(detailDir)
680 if options.effects_filter:
681 eff_codes = options.effects_filter.split(',')
682 for val in eff_codes:
683 code = val.strip()
684 if code:
685 effects_list.append(code)
686
687 #
688 # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
689 try:
690 snpEffects = []
691 homopolymers = []
692 polya_count = 0
693 polya_list = []
694 fh = open( options.vcfFile )
695 while True:
696 line = fh.readline()
697 if not line:
698 break #EOF
699 if line:
700 if outputFile:
701 print >> outputFile, "%s\n" % line
702 line = line.strip()
703 if len(line) < 1:
704 continue
705 elif line.startswith('#'):
706 # check for SnpEffVersion
707 """
708 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
709 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
710 """
711 if line.startswith('##SnpEffVersion='):
712 m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line)
713 if m:
714 SnpEffVersion = m.group(2)
715 # check for SnpEff Info Description
716 """
717 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
718 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
719 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
720 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
721 """
722 pass
723 else:
724 if debug: print >> sys.stdout, "%s\n" % line
725 freq = None
726 depth = None
727 fields = line.split('\t')
728 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]
729 if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10)
730 if options.debug: print(fields)
731 for idx,alt in enumerate(alts.split(',')):
732 if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts)
733 if not re.match('^[ATCG]*$',alt):
734 print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt)
735 for info_item in info.split(';'):
736 try:
737 (key,val) = info_item.split('=',1)
738 if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333;
739 freqs = info_item.split(',')
740 (k,v) = freqs[idx].split('=',1)
741 freq = v
742 elif key == 'DP':
743 depth = val
744 elif key == 'EFF':
745 transcript_ids = []
746 effects = val.split(',')
747 ccds_dict = None
748 for effect in effects:
749 (eff,effs) = effect.rstrip(')').split('(')
750 eff_fields = effs.split('|')
751 if SnpEffVersion == '2':
752 """
753 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
754 EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
755 """
756 (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9]
757 else: # SnpEffVersion 3 # adds Amino_Acid_length field
758 """
759 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
760 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
761 """
762 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]
763 if transcript:
764 transcript_ids.append(transcript)
765 if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids)
766 if options.with_ccds:
767 ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids)
768 if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict)
769 for effect in effects:
770 snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth)
771 snpEffect.transcript_ids = transcript_ids
772 snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion)
773 if effects_list and not snpEffect.effect in effects_list:
774 continue
775 if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict):
776 if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds):
777 if snpEffect.cds_pos:
778 if snpEffect.inHomopolymer(options.polyA_limit):
779 homopolymers.append(snpEffect)
780 else:
781 snpEffects.append(snpEffect)
782 if outputFile:
783 print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields())
784 printSeqText(outputFile,snpEffect)
785 if snpEffect.cds and snpEffect.cds_pos and not options.all_effects:
786 ## Just report the first sequence returned for a vcf line
787 break
788 except Exception, e:
789 sys.stderr.write( "line: %s err: %s\n" % (line,e) )
790 print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers)
791 # Sort snpEffects
792 snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True)
793 if outputHtml:
794 printReportHtml(outputHtml, detailDir, snpEffects)
795 if outputTsv:
796 printReportTsv(outputTsv,snpEffects)
797 except Exception, e:
798 print(str(e))
799
800 if __name__ == "__main__": __main__()
801