0
|
1 #!/usr/bin/python
|
|
2 import sys,os,tempfile,re
|
|
3 import httplib, urllib
|
|
4 import optparse
|
|
5 #import vcfClass
|
|
6 #from vcfClass import *
|
|
7 #import tools
|
|
8 #from tools import *
|
|
9
|
|
10 debug = False
|
|
11 cds_anchor = 'cds_variation'
|
|
12 aa_anchor = 'aa_variation'
|
|
13
|
|
14 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
|
|
15 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
|
|
16 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
|
|
17 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
|
|
18 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
|
|
19 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
|
|
20 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
|
|
21 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
|
|
22 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
|
|
23 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
|
|
24 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
|
|
25 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
|
|
26 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
|
|
27 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
|
|
28 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
|
|
29 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
|
|
30
|
|
31 def reverseComplement(seq) :
|
|
32 rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper()
|
|
33 return rev
|
|
34
|
|
35 def translate(seq) :
|
|
36 rna = seq.upper().replace('T','U')
|
|
37 aa = []
|
|
38 for i in range(0,len(rna) - 2, 3):
|
|
39 aa.append(codon_map[rna[i:i+3]])
|
|
40 return ''.join(aa)
|
|
41
|
|
42 """
|
|
43 SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand
|
|
44 Queries that request a sequence always return the sequence in the first column regardless of the order of attributes.
|
|
45 """
|
|
46 query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?>
|
|
47 <!DOCTYPE Query>
|
|
48 <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
|
|
49 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
|
|
50 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
|
|
51 <Filter name = "with_ccds" excluded = "0"/>
|
|
52 <Attribute name = "ensembl_transcript_id" />
|
|
53 <Attribute name = "ccds" />
|
|
54 </Dataset>
|
|
55 </Query>
|
|
56 """
|
|
57 ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>'
|
|
58 query_template = """<?xml version="1.0" encoding="UTF-8"?>
|
|
59 <!DOCTYPE Query>
|
|
60 <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
|
|
61 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
|
|
62 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
|
|
63 <Filter name = "biotype" value = "protein_coding"/>
|
|
64 __CCDS_FILTER_HERE__
|
|
65 <Attribute name = "cdna" />
|
|
66 <Attribute name = "ensembl_gene_id" />
|
|
67 <Attribute name = "ensembl_transcript_id" />
|
|
68 <Attribute name = "strand" />
|
|
69 <Attribute name = "transcript_start" />
|
|
70 <Attribute name = "transcript_end"/>
|
|
71 <Attribute name = "exon_chrom_start"/>
|
|
72 <Attribute name = "exon_chrom_end"/>
|
|
73 <Attribute name = "cdna_coding_start" />
|
|
74 <Attribute name = "cdna_coding_end" />
|
|
75 <Attribute name = "cds_length"/>
|
|
76 <Attribute name = "rank"/>
|
|
77 <Attribute name = "5_utr_start" />
|
|
78 <Attribute name = "5_utr_end" />
|
|
79 <Attribute name = "3_utr_start" />
|
|
80 <Attribute name = "3_utr_end" />
|
|
81 <Attribute name = "ensembl_peptide_id"/>
|
|
82 <Attribute name = "start_position" />
|
|
83 <Attribute name = "end_position" />
|
|
84 </Dataset>
|
|
85 </Query>
|
|
86 """
|
|
87 """
|
|
88 Columns(19):
|
|
89 ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)']
|
|
90
|
|
91 0 cDNA sequence
|
|
92 1 Ensembl Gene ID
|
|
93 2 Ensembl Transcript ID
|
|
94 3 Strand
|
|
95 - 4 Transcript Start (bp)
|
|
96 - 5 Transcript End (bp)
|
|
97 6 Exon Chr Start (bp)
|
|
98 7 Exon Chr End (bp)
|
|
99 8 CDS Start
|
|
100 9 CDS End
|
|
101 10 CDS Length
|
|
102 - 11 Exon Rank in Transcript
|
|
103 12 5' UTR Start
|
|
104 13 5' UTR End
|
|
105 14 3' UTR Start
|
|
106 15 3' UTR End
|
|
107 - 16 Ensembl Protein ID
|
|
108 - 17 Gene Start (bp)
|
|
109 - 18 Gene End (bp)
|
|
110 """
|
|
111
|
|
112 # return {transcript_id : ccds_id}
|
|
113 def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids):
|
|
114 ccds_dict = dict()
|
|
115 if transcript_ids:
|
|
116 query = query_ccds_template
|
|
117 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
|
|
118 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query)
|
|
119 params = urllib.urlencode({'query':query})
|
|
120 headers = {"Accept": "text/plain"}
|
|
121 if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query)
|
|
122 try:
|
|
123 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
|
|
124 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
|
|
125 conn.request("POST", "/biomart/martservice", params, headers)
|
|
126 response = conn.getresponse()
|
|
127 data = response.read()
|
|
128 if len(data) > 0:
|
|
129 # if debug: print >> sys.stdout, "%s\n\n" % data
|
|
130 lines = data.split('\n')
|
|
131 for line in lines:
|
|
132 fields = line.split('\t')
|
|
133 if len(fields) == 2:
|
|
134 (transcript_id,ccds) = fields
|
|
135 ccds_dict[transcript_id] = ccds
|
|
136 if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines)
|
|
137 except Exception, e:
|
|
138 sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) )
|
|
139 return ccds_dict
|
|
140
|
|
141 def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True):
|
|
142 query = query_template
|
|
143 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
|
|
144 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query)
|
|
145 query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query)
|
|
146 return query
|
|
147
|
|
148 # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence]
|
|
149 def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True):
|
|
150 transcript_id = snpEffect.transcript
|
|
151 chr_pos = snpEffect.pos
|
|
152 query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds)
|
|
153 if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query)
|
|
154 params = urllib.urlencode({'query':query})
|
|
155 headers = {"Accept": "text/plain"}
|
|
156 pos = snpEffect.pos
|
|
157 cds_pos = None # zero based offset
|
|
158 coding_sequence = ''
|
|
159 cds_ref = snpEffect.ref
|
|
160 cds_alt = snpEffect.alt
|
|
161 try:
|
|
162 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
|
|
163 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
|
|
164 conn.request("POST", "/biomart/martservice", params, headers)
|
|
165 response = conn.getresponse()
|
|
166 data = response.read()
|
|
167 if len(data) > 0:
|
|
168 # if debug: print >> sys.stdout, "%s\n\n" % data
|
|
169 lines = data.split('\n')
|
|
170 # Use the header line to determine the order of fields
|
|
171 line = lines[0]
|
|
172 # if debug: print >> sys.stdout, "Headers:\n%s\n" % line
|
|
173 colnames = line.split('\t')
|
|
174 # for i in range(len(colnames)):
|
|
175 #
|
|
176 if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames)
|
|
177 for line in lines[1:]:
|
|
178 if line == None or len(line) < 2:
|
|
179 continue
|
|
180 field = line.split('\t')
|
|
181 if len(field) != len(colnames):
|
|
182 continue
|
|
183 if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line)
|
|
184 if field[10] == None or len(field[10]) < 1:
|
|
185 continue
|
|
186 if debug:
|
|
187 for i in range(len(colnames)):
|
|
188 print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i])
|
|
189 snpEffect.gene_id = field[1]
|
|
190 strand = '+' if int(field[3]) > 0 else '-'
|
|
191 snpEffect.strand = strand
|
|
192 # coding_sequence is first
|
|
193 if len(field) > 10 and re.match('[ATGCN]+',field[0]):
|
|
194 if field[10] == None or len(field[10]) < 1:
|
|
195 continue
|
|
196 cdna_seq = field[0]
|
|
197 in_utr = False
|
|
198 # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence
|
|
199 utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-'))
|
|
200 utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-'))
|
|
201 utr_5_lens = []
|
|
202 for i in range(len(utr_5_starts)):
|
|
203 utr_5_start = int(utr_5_starts[i])
|
|
204 utr_5_end = int(utr_5_ends[i])
|
|
205 utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1)
|
|
206 if utr_5_start <= pos <= utr_5_end :
|
|
207 in_utr = True
|
|
208 if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end);
|
|
209 break
|
|
210 utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-'))
|
|
211 utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-'))
|
|
212 for i in range(len(utr_3_starts)):
|
|
213 utr_3_start = int(utr_3_starts[i])
|
|
214 utr_3_end = int(utr_3_ends[i])
|
|
215 if utr_3_start <= pos <= utr_3_end :
|
|
216 in_utr = True
|
|
217 if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end);
|
|
218 break
|
|
219 # Get coding sequence from cdna
|
|
220 cdna_length = int(field[10])
|
|
221 cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']'))
|
|
222 cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']'))
|
|
223 cdna_lens = []
|
|
224 for i in range(len(cdna_coding_start)):
|
|
225 cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1)
|
|
226 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
|
|
227 cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet
|
|
228 for i in range(len(cdna_coding_start)):
|
|
229 if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1]
|
|
230 coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]]
|
|
231 snpEffect.cds = coding_sequence
|
|
232 if coding_sequence and len(coding_sequence) >= 3:
|
|
233 snpEffect.cds_stop_codon = coding_sequence[-3:]
|
|
234 if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence)
|
|
235 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
|
|
236 if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens)
|
|
237 if not in_utr:
|
|
238 exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-'))
|
|
239 exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-'))
|
|
240 exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-'))
|
|
241 exon_lens = []
|
|
242 for i in range(len(exon_start)):
|
|
243 exon_lens.append(exon_end[i] - exon_start[i] + 1)
|
|
244 if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens)
|
|
245 if debug:
|
|
246 bp_tot = 0
|
|
247 for i in range(len(exon_start)):
|
|
248 exon_len = exon_end[i] - exon_start[i] + 1
|
|
249 bp_tot += exon_len
|
|
250 print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot)
|
|
251 cds_idx = 0
|
|
252 for i in range(len(exon_start)):
|
|
253 # Does this exon have cds bases - i.e. not entirely in UTR
|
|
254 if len(utr_5_ends) > 0:
|
|
255 if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]:
|
|
256 continue
|
|
257 if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]:
|
|
258 continue
|
|
259 exon_len = exon_end[i] - exon_start[i] + 1
|
|
260 if exon_start[i] <= pos <= exon_end[i]:
|
|
261 if strand == '+':
|
|
262 if cds_idx: # find offset from start of cdna_coding and exon
|
|
263 offset = pos - exon_start[i]
|
|
264 else: # find offset from end of cdna_coding and exon
|
|
265 offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) )
|
|
266 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset
|
|
267 else: # negative strand
|
|
268 cds_ref = reverseComplement(snpEffect.ref)
|
|
269 cds_alt = reverseComplement(snpEffect.alt)
|
|
270 offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos
|
|
271 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1)
|
|
272 snpEffect.cds_pos = cds_pos
|
|
273 snpEffect.cds_ref = cds_ref
|
|
274 snpEffect.cds_alt = cds_alt
|
|
275 return snpEffect
|
|
276 else:
|
|
277 cds_idx += 1
|
|
278 except Exception, e:
|
|
279 sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) )
|
|
280 finally:
|
|
281 if conn != None :
|
|
282 conn.close()
|
|
283 return None
|
|
284
|
|
285 """
|
|
286 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
|
|
287 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
|
|
288 """
|
|
289 class SnpEffect( object ):
|
|
290 report_headings = ['Gene','Variant Position','Reference','Variation','Penetrance Percent','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','AA Variation']
|
|
291 def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False):
|
|
292 self.chrom = chrom
|
|
293 self.pos = int(pos)
|
|
294 self.ref = ref
|
|
295 self.alt = alt
|
|
296 self.freq = float(freq) if freq else None
|
|
297 self.depth = int(depth) if depth else None
|
|
298 # From SnpEffect VCF String
|
|
299 self.effect = None
|
|
300 self.effect_impact = None
|
|
301 self.functional_class = None
|
|
302 self.codon_change = None
|
|
303 self.amino_acid_change = None
|
|
304 self.amino_acid_length = None
|
|
305 self.gene_name = None
|
|
306 self.gene_id = None
|
|
307 self.gene_biotype = None
|
|
308 self.coding = None
|
|
309 self.transcript = None
|
|
310 self.transcript_ids = None
|
|
311 self.exon = None
|
|
312 self.cds_stop_codon = None
|
|
313 # retrieve from ensembl
|
|
314 self.strand = None
|
|
315 self.cds_pos = None
|
|
316 self.cds_ref = None
|
|
317 self.cds_alt = None
|
|
318 self.cds = None
|
|
319 self.aa_pos = None
|
|
320 self.aa_len = None
|
|
321 self.alt_stop_codon = None
|
|
322 def chrPos(self):
|
|
323 return "%s:%s" % (self.chrom,self.pos)
|
|
324 def setEffect(self, effect, snpEffVersion = None):
|
|
325 if snpEffVersion and snpEffVersion == '2':
|
|
326 (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9]
|
|
327 else: # SnpEffVersion 3 # adds Amino_Acid_length field
|
|
328 (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10]
|
|
329 self.amino_acid_length = amino_acid_length
|
|
330 self.effect_impact = effect_impact
|
|
331 self.functional_class = functional_class
|
|
332 self.codon_change = codon_change
|
|
333 self.amino_acid_change = amino_acid_change
|
|
334 self.gene_name = gene_name
|
|
335 self.gene_biotype = gene_biotype
|
|
336 self.coding = coding
|
|
337 self.transcript = transcript
|
|
338 self.exon = exon
|
|
339 def parseEffect(self, effect, snpEffVersion = None):
|
|
340 (eff,effs) = effect.rstrip(')').split('(')
|
|
341 self.effect = eff
|
|
342 eff_fields = effs.split('|')
|
|
343 if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields)
|
|
344 self.setEffect(eff_fields, snpEffVersion)
|
|
345 def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False):
|
|
346 getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds)
|
|
347 return len(self.cds) > 0 if self.cds else False
|
|
348 def score(self):
|
|
349 return self.freq * self.depth if self.freq and self.depth else -1
|
|
350 def getCodingSequence(self):
|
|
351 return self.cds
|
|
352 def getVariantSequence(self):
|
|
353 if self.cds and self.cds_pos and self.cds_ref and self.cds_alt:
|
|
354 return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):]
|
|
355 else:
|
|
356 if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds)
|
|
357 return None
|
|
358 def getCodingAminoSequence(self):
|
|
359 seq = translate(self.cds) if self.cds else None
|
|
360 if seq:
|
|
361 self.aa_len = len(seq)
|
|
362 return seq
|
|
363 def getVariantAminoSequence(self):
|
|
364 variant_seq = self.getVariantSequence()
|
|
365 return translate(variant_seq) if variant_seq else None
|
|
366 def getVariantPeptide(self,toStopCodon=True):
|
|
367 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
|
|
368 if var_aa:
|
|
369 if toStopCodon:
|
|
370 end_pos = var_aa_end
|
|
371 else:
|
|
372 end_pos = len(var_aa)
|
|
373 novel_peptide = var_aa[var_aa_pos:end_pos]
|
|
374 return novel_peptide
|
|
375 return None
|
|
376 # [preAA,varAA,postAA, varPeptide, varOffset, subAA]
|
|
377 def getNonSynonymousPeptide(self,start_offset,end_offset,toStopCodon=True):
|
|
378 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
|
|
379 if var_aa:
|
|
380 start_pos = max(var_aa_pos - start_offset,0) if start_offset and int(start_offset) >= 0 else 0
|
|
381 if toStopCodon:
|
|
382 end_pos = var_aa_end
|
|
383 else:
|
|
384 end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end
|
|
385 try:
|
|
386 varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_'
|
|
387 if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos)
|
|
388 mutation = "p.%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA)
|
|
389 preAA = var_aa[start_pos:var_aa_pos] # N-term side
|
|
390 postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side
|
|
391 novel_peptide = var_aa[start_pos:end_pos]
|
|
392 return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation]
|
|
393 except Exception, e:
|
|
394 sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) )
|
|
395 return None
|
|
396 # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop]
|
|
397 # positions are 0-based array indexes
|
|
398 def getSequences(self):
|
|
399 coding_dna = self.getCodingSequence()
|
|
400 coding_aa = self.getCodingAminoSequence()
|
|
401 var_dna = self.getVariantSequence()
|
|
402 var_aa = self.getVariantAminoSequence()
|
|
403 var_aa_pos = None
|
|
404 var_aa_end = None
|
|
405 if var_aa:
|
|
406 var_aa_pos = self.cds_pos / 3
|
|
407 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))):
|
|
408 if var_aa[j] != coding_aa[j]:
|
|
409 var_aa_pos = j
|
|
410 break
|
|
411 self.aa_pos = var_aa_pos
|
|
412 var_stop = var_aa.find('*',var_aa_pos)
|
|
413 if var_stop < 0:
|
|
414 var_aa_end = len(var_aa)
|
|
415 else:
|
|
416 var_aa_end = var_stop + 1 # include the stop codon
|
|
417 stop_pos = var_stop * 3
|
|
418 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3]
|
|
419 return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end]
|
|
420 def inHomopolymer(self,polyA_limit):
|
|
421 if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region
|
|
422 ## check if base changes at cds_pos involve A or T
|
|
423 bdiff = None # the difference between the cds_ref and cds_alt
|
|
424 boff = 0 # the offset to the difference
|
|
425 if len(self.cds_ref) < len(self.cds_alt):
|
|
426 bdiff = re.sub(self.cds_ref,'',self.cds_alt)
|
|
427 boff = self.cds_alt.find(bdiff)
|
|
428 elif len(self.cds_alt) < len(self.cds_ref):
|
|
429 bdiff = re.sub(self.cds_alt,'',self.cds_ref)
|
|
430 boff = self.cds_ref.find(bdiff)
|
|
431 if bdiff:
|
|
432 ## check the number of similar base neighboring the change
|
|
433 alt_seq = self.getVariantSequence()
|
|
434 subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))]
|
|
435 ## adjust match length if the cps_pos is within polyA_limit form sequence end
|
|
436 match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit))
|
|
437 if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff)
|
|
438 ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times
|
|
439 if bdiff.find('A') >= 0:
|
|
440 pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq))
|
|
441 match = re.match(pat,subseq)
|
|
442 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
|
|
443 if match:
|
|
444 return True
|
|
445 if bdiff.find('T') >= 0:
|
|
446 pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq))
|
|
447 match = re.match(pat,subseq)
|
|
448 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
|
|
449 if match:
|
|
450 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups())
|
|
451 return True
|
|
452 return False
|
|
453 def getReportHeader():
|
|
454 return report_headings
|
|
455 def getReportFields(self):
|
|
456 gene_name = self.gene_name if self.gene_name else ''
|
|
457 cds_ref = self.cds_ref if self.cds_ref else ''
|
|
458 cds_alt = self.cds_alt if self.cds_alt else ''
|
|
459 hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False)
|
|
460 if debug: print >> sys.stdout, "HGVS %s" % hgvs
|
|
461 var_aa = self.getVariantPeptide(toStopCodon=True)
|
|
462 freq = "%2.2f" % self.freq if self.freq else ''
|
|
463 depth = "%d" % self.depth if self.depth else ''
|
|
464 aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else ''
|
|
465 aa_len = "%d" % self.aa_len if self.aa_len else ''
|
|
466 gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript
|
|
467 stop_codon = self.alt_stop_codon if self.alt_stop_codon else ''
|
|
468 chrpos = self.chrPos()
|
|
469 aa_change = self.amino_acid_change if self.amino_acid_change else hgvs[5]
|
|
470 return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,var_aa if var_aa else '']
|
|
471
|
|
472 def __main__():
|
|
473
|
|
474 def printReportTsv(output_file, snpEffects):
|
|
475 if output_file:
|
|
476 print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings)
|
|
477 for snpEffect in snpEffects:
|
|
478 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
|
|
479 if not snpEffect.effect == 'FRAME_SHIFT':
|
|
480 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
|
|
481 novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA)
|
|
482 print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide])
|
|
483
|
|
484 """
|
|
485 http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111
|
|
486 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970
|
|
487 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774
|
|
488 """
|
|
489 def printReportHtml(output_file, detail_dir, snpEffects):
|
|
490 if output_file:
|
|
491 print >> output_file, '<HTML><BODY>\n'
|
|
492 print >> output_file, '<TABLE BORDER="1">\n'
|
|
493 print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Penetrance Percent</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n'
|
|
494 for snpEffect in snpEffects:
|
|
495 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
|
|
496 refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html'
|
|
497 aa_ref = '#'.join([refname,aa_anchor])
|
|
498 refpath = os.path.join(detail_dir,refname)
|
|
499 try:
|
|
500 ref_file = open(refpath,'w')
|
|
501 printEffDetailHtml(ref_file,snpEffect)
|
|
502 ref_file.close()
|
|
503 except Exception, e:
|
|
504 sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) )
|
|
505 if snpEffect.effect == 'FRAME_SHIFT':
|
|
506 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref,novel_peptide)
|
|
507 else:
|
|
508 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
|
|
509 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref, preAA,varAA,postAA)
|
|
510 print >> output_file, '</TABLE>\n'
|
|
511 print >> output_file, '</BODY></HTML>\n'
|
|
512
|
|
513 def printEffDetailHtml(output_file,snpEffect,wrap_len=60):
|
|
514 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
|
|
515 seq_len = len(coding_seq)
|
|
516 print >> output_file, '<HTML><BODY>\n'
|
|
517 print >> output_file, '<TABLE>\n'
|
|
518 print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name
|
|
519 print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt)
|
|
520 print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript)
|
|
521 print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand
|
|
522 print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1))
|
|
523 print >> output_file, '</TABLE>\n'
|
|
524 if alt_aa:
|
|
525 alt_aa_pos = pos / 3
|
|
526 if coding_aa:
|
|
527 for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))):
|
|
528 if alt_aa[j] != coding_aa[j]:
|
|
529 alt_aa_pos = j
|
|
530 break
|
|
531 alt_aa_end = 0
|
|
532 for j in range(alt_aa_pos,len(alt_aa)):
|
|
533 if alt_aa[j] == '*':
|
|
534 alt_aa_end = j
|
|
535 break
|
|
536 seq = []
|
|
537 for j in range(1,wrap_len+1):
|
|
538 seq.append('-' if j % 10 else '|')
|
|
539 hrule = '</TD><TD>'.join(seq)
|
|
540 print >> output_file, '<TABLE cellspacing="0">'
|
|
541 for i in range(0,seq_len,wrap_len):
|
|
542 e = min(i + wrap_len,seq_len)
|
|
543 sa = i/3
|
|
544 ea = (i+wrap_len)/3
|
|
545 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule)
|
|
546 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
|
|
547 for j in range(i,i + wrap_len):
|
|
548 if j < len(coding_seq):
|
|
549 if pos == j:
|
|
550 print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j])
|
|
551 elif pos <= j < pos + len(ref):
|
|
552 print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j]
|
|
553 else:
|
|
554 print >> output_file, "<TD>%s</TD>" % coding_seq[j]
|
|
555 else:
|
|
556 print >> output_file, "<TD></TD>"
|
|
557 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
|
|
558 if coding_aa:
|
|
559 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
|
|
560 for j in range(sa,ea):
|
|
561 if j < len(coding_aa):
|
|
562 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j]
|
|
563 else:
|
|
564 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
|
|
565 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
|
|
566 if alt_aa:
|
|
567 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
|
|
568 for j in range(sa,ea):
|
|
569 if j < len(alt_aa):
|
|
570 if alt_aa_pos == j:
|
|
571 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j])
|
|
572 elif alt_aa_pos < j <= alt_aa_end:
|
|
573 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j]
|
|
574 else:
|
|
575 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j]
|
|
576 else:
|
|
577 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
|
|
578 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
|
|
579 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
|
|
580 for j in range(i,i + wrap_len):
|
|
581 if j < len(alt_seq):
|
|
582 if pos <= j < pos + len(alt):
|
|
583 print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j]
|
|
584 else:
|
|
585 print >> output_file, "<TD>%s</TD>" % alt_seq[j]
|
|
586 else:
|
|
587 print >> output_file, "<TD></TD>"
|
|
588 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
|
|
589 print >> output_file, "</TABLE>"
|
|
590 print >> output_file, "</BODY></HTML>"
|
|
591
|
|
592 def printSeqText(output_file,snpEffect, wrap_len=120):
|
|
593 nw = 6
|
|
594 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
|
|
595 if coding_seq:
|
|
596 seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0)
|
|
597 rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|'
|
|
598 for i in range(0,seq_len,wrap_len):
|
|
599 e = min(i + wrap_len,seq_len)
|
|
600 sa = i/3
|
|
601 ea = e/3
|
|
602 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'')
|
|
603 print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e)
|
|
604 if coding_aa:
|
|
605 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea)
|
|
606 if alt_aa:
|
|
607 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea)
|
|
608 if alt_seq:
|
|
609 print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e)
|
|
610
|
|
611 # Parse the command line options
|
|
612 usage = "Usage: snpEff_cds_report.py filter [options]"
|
|
613 parser = optparse.OptionParser(usage = usage)
|
|
614 parser.add_option("-i", "--in",
|
|
615 action="store", type="string",
|
|
616 dest="vcfFile", help="input vcf file")
|
|
617 parser.add_option("-o", "--out",
|
|
618 action="store", type="string",
|
|
619 dest="output", help="output report file")
|
|
620 parser.add_option("-H", "--html_report",
|
|
621 action="store", type="string",
|
|
622 dest="html_file", help="html output report file")
|
|
623 parser.add_option("-D", "--html_dir",
|
|
624 action="store", type="string",
|
|
625 dest="html_dir", help="html output report file")
|
|
626 parser.add_option("-t", "--tsv_file",
|
|
627 action="store", type="string",
|
|
628 dest="tsv_file", help="TAB Separated Value (.tsv) output report file")
|
|
629 parser.add_option("-e", "--ensembl_host",
|
|
630 action="store", type="string",
|
|
631 dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org')
|
|
632 parser.add_option("-f", "--effects_filter",
|
|
633 action="store", type="string", default=None,
|
|
634 dest="effects_filter", help="a list of effects to filter")
|
|
635 parser.add_option("-O", "--ensembl_dataset",
|
|
636 action="store", type="string",
|
|
637 dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl')
|
|
638 parser.add_option("-p", "--polyA_limit",
|
|
639 action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' )
|
|
640 parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' )
|
|
641 parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' )
|
|
642 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
|
|
643
|
|
644 (options, args) = parser.parse_args()
|
|
645
|
|
646 debug = options.debug
|
|
647 ensembl_host = options.ensembl_host
|
|
648
|
|
649 # Check that a single vcf file is given.
|
|
650 if options.vcfFile == None:
|
|
651 parser.print_help()
|
|
652 print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report "
|
|
653 exit(1)
|
|
654 outputFile = None
|
|
655 outputHtml = None
|
|
656 detailDir = None
|
|
657 outputTsv = None
|
|
658 tmpHtmlName = None
|
|
659 tmpHtml = None
|
|
660 effects_list = []
|
|
661 # Set the output file to stdout if no output file was specified.
|
|
662 if options.output == None and options.html_file == None and options.tsv_file == None:
|
|
663 outputFile = sys.stdout
|
|
664 if options.output != None:
|
|
665 output = os.path.abspath(options.output)
|
|
666 outputFile = open(output, 'w')
|
|
667 if options.tsv_file != None:
|
|
668 output = os.path.abspath(options.tsv_file)
|
|
669 outputTsv = open(output, 'w')
|
|
670 if options.html_file != None:
|
|
671 output = os.path.abspath(options.html_file)
|
|
672 outputHtml = open(output, 'w')
|
|
673 if options.html_dir == None:
|
|
674 detailDir = os.path.dirname(os.path.abspath(output))
|
|
675 else:
|
|
676 detailDir = options.html_dir
|
|
677 if detailDir:
|
|
678 if not os.path.isdir(detailDir):
|
|
679 os.makedirs(detailDir)
|
|
680 if options.effects_filter:
|
|
681 eff_codes = options.effects_filter.split(',')
|
|
682 for val in eff_codes:
|
|
683 code = val.strip()
|
|
684 if code:
|
|
685 effects_list.append(code)
|
|
686
|
|
687 #
|
|
688 # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
|
|
689 try:
|
|
690 snpEffects = []
|
|
691 homopolymers = []
|
|
692 polya_count = 0
|
|
693 polya_list = []
|
|
694 fh = open( options.vcfFile )
|
|
695 while True:
|
|
696 line = fh.readline()
|
|
697 if not line:
|
|
698 break #EOF
|
|
699 if line:
|
|
700 if outputFile:
|
|
701 print >> outputFile, "%s\n" % line
|
|
702 line = line.strip()
|
|
703 if len(line) < 1:
|
|
704 continue
|
|
705 elif line.startswith('#'):
|
|
706 # check for SnpEffVersion
|
|
707 """
|
|
708 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
|
|
709 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
|
|
710 """
|
|
711 if line.startswith('##SnpEffVersion='):
|
|
712 m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line)
|
|
713 if m:
|
|
714 SnpEffVersion = m.group(2)
|
|
715 # check for SnpEff Info Description
|
|
716 """
|
|
717 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
|
|
718 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
|
|
719 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
|
|
720 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
|
|
721 """
|
|
722 pass
|
|
723 else:
|
|
724 if debug: print >> sys.stdout, "%s\n" % line
|
|
725 freq = None
|
|
726 depth = None
|
|
727 fields = line.split('\t')
|
|
728 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]
|
|
729 if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10)
|
|
730 if options.debug: print(fields)
|
|
731 for idx,alt in enumerate(alts.split(',')):
|
|
732 if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts)
|
|
733 if not re.match('^[ATCG]*$',alt):
|
|
734 print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt)
|
|
735 for info_item in info.split(';'):
|
|
736 try:
|
|
737 (key,val) = info_item.split('=',1)
|
|
738 if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333;
|
|
739 freqs = info_item.split(',')
|
|
740 (k,v) = freqs[idx].split('=',1)
|
|
741 freq = v
|
|
742 elif key == 'DP':
|
|
743 depth = val
|
|
744 elif key == 'EFF':
|
|
745 transcript_ids = []
|
|
746 effects = val.split(',')
|
|
747 ccds_dict = None
|
|
748 for effect in effects:
|
|
749 (eff,effs) = effect.rstrip(')').split('(')
|
|
750 eff_fields = effs.split('|')
|
|
751 if SnpEffVersion == '2':
|
|
752 """
|
|
753 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
|
|
754 EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
|
|
755 """
|
|
756 (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9]
|
|
757 else: # SnpEffVersion 3 # adds Amino_Acid_length field
|
|
758 """
|
|
759 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
|
|
760 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
|
|
761 """
|
|
762 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]
|
|
763 if transcript:
|
|
764 transcript_ids.append(transcript)
|
|
765 if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids)
|
|
766 if options.with_ccds:
|
|
767 ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids)
|
|
768 if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict)
|
|
769 for effect in effects:
|
|
770 snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth)
|
|
771 snpEffect.transcript_ids = transcript_ids
|
|
772 snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion)
|
|
773 if effects_list and not snpEffect.effect in effects_list:
|
|
774 continue
|
|
775 if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict):
|
|
776 if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds):
|
|
777 if snpEffect.cds_pos:
|
|
778 if snpEffect.inHomopolymer(options.polyA_limit):
|
|
779 homopolymers.append(snpEffect)
|
|
780 else:
|
|
781 snpEffects.append(snpEffect)
|
|
782 if outputFile:
|
|
783 print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields())
|
|
784 printSeqText(outputFile,snpEffect)
|
|
785 if snpEffect.cds and snpEffect.cds_pos and not options.all_effects:
|
|
786 ## Just report the first sequence returned for a vcf line
|
|
787 break
|
|
788 except Exception, e:
|
|
789 sys.stderr.write( "line: %s err: %s\n" % (line,e) )
|
|
790 print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers)
|
|
791 # Sort snpEffects
|
|
792 snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True)
|
|
793 if outputHtml:
|
|
794 printReportHtml(outputHtml, detailDir, snpEffects)
|
|
795 if outputTsv:
|
|
796 printReportTsv(outputTsv,snpEffects)
|
|
797 except Exception, e:
|
|
798 print(str(e))
|
|
799
|
|
800 if __name__ == "__main__": __main__()
|
|
801
|