Mercurial > repos > jjohnson > snpeff_cds_report
view snpEff_cds_report.py @ 0:cdbdac66d6b5
Uploaded
author | jjohnson |
---|---|
date | Tue, 12 Mar 2013 13:59:14 -0400 |
parents | |
children | 29b286896c50 |
line wrap: on
line source
#!/usr/bin/python import sys,os,tempfile,re import httplib, urllib import optparse #import vcfClass #from vcfClass import * #import tools #from tools import * debug = False cds_anchor = 'cds_variation' aa_anchor = 'aa_variation' codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L", "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S", "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*", "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W", "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L", "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P", "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q", "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R", "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M", "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T", "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K", "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R", "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V", "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A", "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E", "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",} def reverseComplement(seq) : rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper() return rev def translate(seq) : rna = seq.upper().replace('T','U') aa = [] for i in range(0,len(rna) - 2, 3): aa.append(codon_map[rna[i:i+3]]) return ''.join(aa) """ SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand Queries that request a sequence always return the sequence in the first column regardless of the order of attributes. """ query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE Query> <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" > <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" > <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/> <Filter name = "with_ccds" excluded = "0"/> <Attribute name = "ensembl_transcript_id" /> <Attribute name = "ccds" /> </Dataset> </Query> """ ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>' query_template = """<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE Query> <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" > <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" > <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/> <Filter name = "biotype" value = "protein_coding"/> __CCDS_FILTER_HERE__ <Attribute name = "cdna" /> <Attribute name = "ensembl_gene_id" /> <Attribute name = "ensembl_transcript_id" /> <Attribute name = "strand" /> <Attribute name = "transcript_start" /> <Attribute name = "transcript_end"/> <Attribute name = "exon_chrom_start"/> <Attribute name = "exon_chrom_end"/> <Attribute name = "cdna_coding_start" /> <Attribute name = "cdna_coding_end" /> <Attribute name = "cds_length"/> <Attribute name = "rank"/> <Attribute name = "5_utr_start" /> <Attribute name = "5_utr_end" /> <Attribute name = "3_utr_start" /> <Attribute name = "3_utr_end" /> <Attribute name = "ensembl_peptide_id"/> <Attribute name = "start_position" /> <Attribute name = "end_position" /> </Dataset> </Query> """ """ Columns(19): ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)'] 0 cDNA sequence 1 Ensembl Gene ID 2 Ensembl Transcript ID 3 Strand - 4 Transcript Start (bp) - 5 Transcript End (bp) 6 Exon Chr Start (bp) 7 Exon Chr End (bp) 8 CDS Start 9 CDS End 10 CDS Length - 11 Exon Rank in Transcript 12 5' UTR Start 13 5' UTR End 14 3' UTR Start 15 3' UTR End - 16 Ensembl Protein ID - 17 Gene Start (bp) - 18 Gene End (bp) """ # return {transcript_id : ccds_id} def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids): ccds_dict = dict() if transcript_ids: query = query_ccds_template query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query) params = urllib.urlencode({'query':query}) headers = {"Accept": "text/plain"} if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query) try: if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') conn.request("POST", "/biomart/martservice", params, headers) response = conn.getresponse() data = response.read() if len(data) > 0: # if debug: print >> sys.stdout, "%s\n\n" % data lines = data.split('\n') for line in lines: fields = line.split('\t') if len(fields) == 2: (transcript_id,ccds) = fields ccds_dict[transcript_id] = ccds if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines) except Exception, e: sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) ) return ccds_dict def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True): query = query_template query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query) query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query) query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query) return query # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence] def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True): transcript_id = snpEffect.transcript chr_pos = snpEffect.pos query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds) if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query) params = urllib.urlencode({'query':query}) headers = {"Accept": "text/plain"} pos = snpEffect.pos cds_pos = None # zero based offset coding_sequence = '' cds_ref = snpEffect.ref cds_alt = snpEffect.alt try: if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host) conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org') conn.request("POST", "/biomart/martservice", params, headers) response = conn.getresponse() data = response.read() if len(data) > 0: # if debug: print >> sys.stdout, "%s\n\n" % data lines = data.split('\n') # Use the header line to determine the order of fields line = lines[0] # if debug: print >> sys.stdout, "Headers:\n%s\n" % line colnames = line.split('\t') # for i in range(len(colnames)): # if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames) for line in lines[1:]: if line == None or len(line) < 2: continue field = line.split('\t') if len(field) != len(colnames): continue if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line) if field[10] == None or len(field[10]) < 1: continue if debug: for i in range(len(colnames)): print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i]) snpEffect.gene_id = field[1] strand = '+' if int(field[3]) > 0 else '-' snpEffect.strand = strand # coding_sequence is first if len(field) > 10 and re.match('[ATGCN]+',field[0]): if field[10] == None or len(field[10]) < 1: continue cdna_seq = field[0] in_utr = False # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-')) utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-')) utr_5_lens = [] for i in range(len(utr_5_starts)): utr_5_start = int(utr_5_starts[i]) utr_5_end = int(utr_5_ends[i]) utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1) if utr_5_start <= pos <= utr_5_end : in_utr = True if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end); break utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-')) utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-')) for i in range(len(utr_3_starts)): utr_3_start = int(utr_3_starts[i]) utr_3_end = int(utr_3_ends[i]) if utr_3_start <= pos <= utr_3_end : in_utr = True if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end); break # Get coding sequence from cdna cdna_length = int(field[10]) cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']')) cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']')) cdna_lens = [] for i in range(len(cdna_coding_start)): cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1) if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet for i in range(len(cdna_coding_start)): if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1] coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]] snpEffect.cds = coding_sequence if coding_sequence and len(coding_sequence) >= 3: snpEffect.cds_stop_codon = coding_sequence[-3:] if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence) if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens) if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens) if not in_utr: exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-')) exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-')) exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-')) exon_lens = [] for i in range(len(exon_start)): exon_lens.append(exon_end[i] - exon_start[i] + 1) if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens) if debug: bp_tot = 0 for i in range(len(exon_start)): exon_len = exon_end[i] - exon_start[i] + 1 bp_tot += exon_len print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot) cds_idx = 0 for i in range(len(exon_start)): # Does this exon have cds bases - i.e. not entirely in UTR if len(utr_5_ends) > 0: if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]: continue if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]: continue exon_len = exon_end[i] - exon_start[i] + 1 if exon_start[i] <= pos <= exon_end[i]: if strand == '+': if cds_idx: # find offset from start of cdna_coding and exon offset = pos - exon_start[i] else: # find offset from end of cdna_coding and exon offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) ) cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset else: # negative strand cds_ref = reverseComplement(snpEffect.ref) cds_alt = reverseComplement(snpEffect.alt) offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1) snpEffect.cds_pos = cds_pos snpEffect.cds_ref = cds_ref snpEffect.cds_alt = cds_alt return snpEffect else: cds_idx += 1 except Exception, e: sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) ) finally: if conn != None : conn.close() return None """ Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) """ class SnpEffect( object ): report_headings = ['Gene','Variant Position','Reference','Variation','Penetrance Percent','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','AA Variation'] def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False): self.chrom = chrom self.pos = int(pos) self.ref = ref self.alt = alt self.freq = float(freq) if freq else None self.depth = int(depth) if depth else None # From SnpEffect VCF String self.effect = None self.effect_impact = None self.functional_class = None self.codon_change = None self.amino_acid_change = None self.amino_acid_length = None self.gene_name = None self.gene_id = None self.gene_biotype = None self.coding = None self.transcript = None self.transcript_ids = None self.exon = None self.cds_stop_codon = None # retrieve from ensembl self.strand = None self.cds_pos = None self.cds_ref = None self.cds_alt = None self.cds = None self.aa_pos = None self.aa_len = None self.alt_stop_codon = None def chrPos(self): return "%s:%s" % (self.chrom,self.pos) def setEffect(self, effect, snpEffVersion = None): if snpEffVersion and snpEffVersion == '2': (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9] else: # SnpEffVersion 3 # adds Amino_Acid_length field (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10] self.amino_acid_length = amino_acid_length self.effect_impact = effect_impact self.functional_class = functional_class self.codon_change = codon_change self.amino_acid_change = amino_acid_change self.gene_name = gene_name self.gene_biotype = gene_biotype self.coding = coding self.transcript = transcript self.exon = exon def parseEffect(self, effect, snpEffVersion = None): (eff,effs) = effect.rstrip(')').split('(') self.effect = eff eff_fields = effs.split('|') if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields) self.setEffect(eff_fields, snpEffVersion) def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False): getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds) return len(self.cds) > 0 if self.cds else False def score(self): return self.freq * self.depth if self.freq and self.depth else -1 def getCodingSequence(self): return self.cds def getVariantSequence(self): if self.cds and self.cds_pos and self.cds_ref and self.cds_alt: return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):] else: if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds) return None def getCodingAminoSequence(self): seq = translate(self.cds) if self.cds else None if seq: self.aa_len = len(seq) return seq def getVariantAminoSequence(self): variant_seq = self.getVariantSequence() return translate(variant_seq) if variant_seq else None def getVariantPeptide(self,toStopCodon=True): (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() if var_aa: if toStopCodon: end_pos = var_aa_end else: end_pos = len(var_aa) novel_peptide = var_aa[var_aa_pos:end_pos] return novel_peptide return None # [preAA,varAA,postAA, varPeptide, varOffset, subAA] def getNonSynonymousPeptide(self,start_offset,end_offset,toStopCodon=True): (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences() if var_aa: start_pos = max(var_aa_pos - start_offset,0) if start_offset and int(start_offset) >= 0 else 0 if toStopCodon: end_pos = var_aa_end else: end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end try: varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_' if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos) mutation = "p.%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA) preAA = var_aa[start_pos:var_aa_pos] # N-term side postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side novel_peptide = var_aa[start_pos:end_pos] return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation] except Exception, e: sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) ) return None # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop] # positions are 0-based array indexes def getSequences(self): coding_dna = self.getCodingSequence() coding_aa = self.getCodingAminoSequence() var_dna = self.getVariantSequence() var_aa = self.getVariantAminoSequence() var_aa_pos = None var_aa_end = None if var_aa: var_aa_pos = self.cds_pos / 3 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))): if var_aa[j] != coding_aa[j]: var_aa_pos = j break self.aa_pos = var_aa_pos var_stop = var_aa.find('*',var_aa_pos) if var_stop < 0: var_aa_end = len(var_aa) else: var_aa_end = var_stop + 1 # include the stop codon stop_pos = var_stop * 3 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3] return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end] def inHomopolymer(self,polyA_limit): if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region ## check if base changes at cds_pos involve A or T bdiff = None # the difference between the cds_ref and cds_alt boff = 0 # the offset to the difference if len(self.cds_ref) < len(self.cds_alt): bdiff = re.sub(self.cds_ref,'',self.cds_alt) boff = self.cds_alt.find(bdiff) elif len(self.cds_alt) < len(self.cds_ref): bdiff = re.sub(self.cds_alt,'',self.cds_ref) boff = self.cds_ref.find(bdiff) if bdiff: ## check the number of similar base neighboring the change alt_seq = self.getVariantSequence() subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))] ## adjust match length if the cps_pos is within polyA_limit form sequence end match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit)) if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff) ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times if bdiff.find('A') >= 0: pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq)) match = re.match(pat,subseq) if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') if match: return True if bdiff.find('T') >= 0: pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq)) match = re.match(pat,subseq) if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match') if match: if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups()) return True return False def getReportHeader(): return report_headings def getReportFields(self): gene_name = self.gene_name if self.gene_name else '' cds_ref = self.cds_ref if self.cds_ref else '' cds_alt = self.cds_alt if self.cds_alt else '' hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False) if debug: print >> sys.stdout, "HGVS %s" % hgvs var_aa = self.getVariantPeptide(toStopCodon=True) freq = "%2.2f" % self.freq if self.freq else '' depth = "%d" % self.depth if self.depth else '' aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else '' aa_len = "%d" % self.aa_len if self.aa_len else '' gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript stop_codon = self.alt_stop_codon if self.alt_stop_codon else '' chrpos = self.chrPos() aa_change = self.amino_acid_change if self.amino_acid_change else hgvs[5] return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,var_aa if var_aa else ''] def __main__(): def printReportTsv(output_file, snpEffects): if output_file: print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings) for snpEffect in snpEffects: (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields() if not snpEffect.effect == 'FRAME_SHIFT': (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA) print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide]) """ http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774 """ def printReportHtml(output_file, detail_dir, snpEffects): if output_file: print >> output_file, '<HTML><BODY>\n' print >> output_file, '<TABLE BORDER="1">\n' print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Penetrance Percent</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n' for snpEffect in snpEffects: (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields() refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html' aa_ref = '#'.join([refname,aa_anchor]) refpath = os.path.join(detail_dir,refname) try: ref_file = open(refpath,'w') printEffDetailHtml(ref_file,snpEffect) ref_file.close() except Exception, e: sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) ) if snpEffect.effect == 'FRAME_SHIFT': print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref,novel_peptide) else: (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False) print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref, preAA,varAA,postAA) print >> output_file, '</TABLE>\n' print >> output_file, '</BODY></HTML>\n' def printEffDetailHtml(output_file,snpEffect,wrap_len=60): (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() seq_len = len(coding_seq) print >> output_file, '<HTML><BODY>\n' print >> output_file, '<TABLE>\n' print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt) print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript) print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1)) print >> output_file, '</TABLE>\n' if alt_aa: alt_aa_pos = pos / 3 if coding_aa: for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))): if alt_aa[j] != coding_aa[j]: alt_aa_pos = j break alt_aa_end = 0 for j in range(alt_aa_pos,len(alt_aa)): if alt_aa[j] == '*': alt_aa_end = j break seq = [] for j in range(1,wrap_len+1): seq.append('-' if j % 10 else '|') hrule = '</TD><TD>'.join(seq) print >> output_file, '<TABLE cellspacing="0">' for i in range(0,seq_len,wrap_len): e = min(i + wrap_len,seq_len) sa = i/3 ea = (i+wrap_len)/3 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule) print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1) for j in range(i,i + wrap_len): if j < len(coding_seq): if pos == j: print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j]) elif pos <= j < pos + len(ref): print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j] else: print >> output_file, "<TD>%s</TD>" % coding_seq[j] else: print >> output_file, "<TD></TD>" print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e if coding_aa: print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1) for j in range(sa,ea): if j < len(coding_aa): print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j] else: print >> output_file, "<TD COLSPAN=\"3\"></TD>" print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea if alt_aa: print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1) for j in range(sa,ea): if j < len(alt_aa): if alt_aa_pos == j: print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j]) elif alt_aa_pos < j <= alt_aa_end: print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j] else: print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j] else: print >> output_file, "<TD COLSPAN=\"3\"></TD>" print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1) for j in range(i,i + wrap_len): if j < len(alt_seq): if pos <= j < pos + len(alt): print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j] else: print >> output_file, "<TD>%s</TD>" % alt_seq[j] else: print >> output_file, "<TD></TD>" print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e print >> output_file, "</TABLE>" print >> output_file, "</BODY></HTML>" def printSeqText(output_file,snpEffect, wrap_len=120): nw = 6 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences() if coding_seq: seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0) rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|' for i in range(0,seq_len,wrap_len): e = min(i + wrap_len,seq_len) sa = i/3 ea = e/3 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'') print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e) if coding_aa: print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea) if alt_aa: print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea) if alt_seq: print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e) # Parse the command line options usage = "Usage: snpEff_cds_report.py filter [options]" parser = optparse.OptionParser(usage = usage) parser.add_option("-i", "--in", action="store", type="string", dest="vcfFile", help="input vcf file") parser.add_option("-o", "--out", action="store", type="string", dest="output", help="output report file") parser.add_option("-H", "--html_report", action="store", type="string", dest="html_file", help="html output report file") parser.add_option("-D", "--html_dir", action="store", type="string", dest="html_dir", help="html output report file") parser.add_option("-t", "--tsv_file", action="store", type="string", dest="tsv_file", help="TAB Separated Value (.tsv) output report file") parser.add_option("-e", "--ensembl_host", action="store", type="string", dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org') parser.add_option("-f", "--effects_filter", action="store", type="string", default=None, dest="effects_filter", help="a list of effects to filter") parser.add_option("-O", "--ensembl_dataset", action="store", type="string", dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl') parser.add_option("-p", "--polyA_limit", action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' ) parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' ) parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' ) parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) (options, args) = parser.parse_args() debug = options.debug ensembl_host = options.ensembl_host # Check that a single vcf file is given. if options.vcfFile == None: parser.print_help() print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report " exit(1) outputFile = None outputHtml = None detailDir = None outputTsv = None tmpHtmlName = None tmpHtml = None effects_list = [] # Set the output file to stdout if no output file was specified. if options.output == None and options.html_file == None and options.tsv_file == None: outputFile = sys.stdout if options.output != None: output = os.path.abspath(options.output) outputFile = open(output, 'w') if options.tsv_file != None: output = os.path.abspath(options.tsv_file) outputTsv = open(output, 'w') if options.html_file != None: output = os.path.abspath(options.html_file) outputHtml = open(output, 'w') if options.html_dir == None: detailDir = os.path.dirname(os.path.abspath(output)) else: detailDir = options.html_dir if detailDir: if not os.path.isdir(detailDir): os.makedirs(detailDir) if options.effects_filter: eff_codes = options.effects_filter.split(',') for val in eff_codes: code = val.strip() if code: effects_list.append(code) # # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) try: snpEffects = [] homopolymers = [] polya_count = 0 polya_list = [] fh = open( options.vcfFile ) while True: line = fh.readline() if not line: break #EOF if line: if outputFile: print >> outputFile, "%s\n" % line line = line.strip() if len(line) < 1: continue elif line.startswith('#'): # check for SnpEffVersion """ ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" """ if line.startswith('##SnpEffVersion='): m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line) if m: SnpEffVersion = m.group(2) # check for SnpEff Info Description """ ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani" ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani" ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> """ pass else: if debug: print >> sys.stdout, "%s\n" % line freq = None depth = None fields = line.split('\t') (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8] if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10) if options.debug: print(fields) for idx,alt in enumerate(alts.split(',')): if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts) if not re.match('^[ATCG]*$',alt): print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt) for info_item in info.split(';'): try: (key,val) = info_item.split('=',1) if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333; freqs = info_item.split(',') (k,v) = freqs[idx].split('=',1) freq = v elif key == 'DP': depth = val elif key == 'EFF': transcript_ids = [] effects = val.split(',') ccds_dict = None for effect in effects: (eff,effs) = effect.rstrip(')').split('(') eff_fields = effs.split('|') if SnpEffVersion == '2': """ Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) """ (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9] else: # SnpEffVersion 3 # adds Amino_Acid_length field """ Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] ) FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414) """ (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10] if transcript: transcript_ids.append(transcript) if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids) if options.with_ccds: ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids) if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict) for effect in effects: snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth) snpEffect.transcript_ids = transcript_ids snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion) if effects_list and not snpEffect.effect in effects_list: continue if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict): if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds): if snpEffect.cds_pos: if snpEffect.inHomopolymer(options.polyA_limit): homopolymers.append(snpEffect) else: snpEffects.append(snpEffect) if outputFile: print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields()) printSeqText(outputFile,snpEffect) if snpEffect.cds and snpEffect.cds_pos and not options.all_effects: ## Just report the first sequence returned for a vcf line break except Exception, e: sys.stderr.write( "line: %s err: %s\n" % (line,e) ) print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers) # Sort snpEffects snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True) if outputHtml: printReportHtml(outputHtml, detailDir, snpEffects) if outputTsv: printReportTsv(outputTsv,snpEffects) except Exception, e: print(str(e)) if __name__ == "__main__": __main__()