Mercurial > repos > jjohnson > snpeff_cds_report

diff snpEff_cds_report.py @ 0:cdbdac66d6b5
Uploaded
author: jjohnson
date: Tue, 12 Mar 2013 13:59:14 -0400
children: 29b286896c50
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpEff_cds_report.py	Tue Mar 12 13:59:14 2013 -0400
@@ -0,0 +1,801 @@
+#!/usr/bin/python
+import sys,os,tempfile,re
+import httplib, urllib
+import optparse
+#import vcfClass
+#from vcfClass import *
+#import tools
+#from tools import *
+
+debug = False
+cds_anchor = 'cds_variation'
+aa_anchor = 'aa_variation'
+
+codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
+    "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
+    "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
+    "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
+    "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
+    "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
+    "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
+    "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
+    "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
+    "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
+    "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
+    "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
+    "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
+    "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
+    "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
+    "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
+
+def reverseComplement(seq) :
+  rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper()
+  return rev
+
+def translate(seq) :
+  rna = seq.upper().replace('T','U')
+  aa = []
+  for i in range(0,len(rna) - 2, 3):
+    aa.append(codon_map[rna[i:i+3]])
+  return ''.join(aa)
+
+"""
+SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand
+Queries that request a sequence always return the sequence in the first column regardless of the order of attributes.
+"""
+query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE Query>
+<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
+	<Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
+		<Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
+		<Filter name = "with_ccds" excluded = "0"/>
+		<Attribute name = "ensembl_transcript_id" />
+		<Attribute name = "ccds" />
+	</Dataset>
+</Query>
+"""
+ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>'
+query_template = """<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE Query>
+<Query  virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
+        <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
+                <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
+                <Filter name = "biotype" value = "protein_coding"/>
+                __CCDS_FILTER_HERE__
+                <Attribute name = "cdna" />
+                <Attribute name = "ensembl_gene_id" />
+                <Attribute name = "ensembl_transcript_id" />
+                <Attribute name = "strand" />
+                <Attribute name = "transcript_start" />
+	        <Attribute name = "transcript_end"/>
+	        <Attribute name = "exon_chrom_start"/>
+	        <Attribute name = "exon_chrom_end"/>
+		<Attribute name = "cdna_coding_start" />
+		<Attribute name = "cdna_coding_end" />
+	        <Attribute name = "cds_length"/>
+	        <Attribute name = "rank"/>
+                <Attribute name = "5_utr_start" />
+                <Attribute name = "5_utr_end" />
+                <Attribute name = "3_utr_start" />
+                <Attribute name = "3_utr_end" />
+	        <Attribute name = "ensembl_peptide_id"/>
+		<Attribute name = "start_position" />
+		<Attribute name = "end_position" />
+        </Dataset>
+</Query>
+"""
+"""
+Columns(19):
+['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)']
+
+  0	cDNA sequence
+  1	Ensembl Gene ID
+  2	Ensembl Transcript ID
+  3	Strand
+- 4	Transcript Start (bp)
+- 5	Transcript End (bp)
+  6	Exon Chr Start (bp)
+  7	Exon Chr End (bp)
+  8	CDS Start
+  9	CDS End
+  10	CDS Length
+- 11	Exon Rank in Transcript
+  12	5' UTR Start
+  13	5' UTR End
+  14	3' UTR Start
+  15	3' UTR End
+- 16	Ensembl Protein ID
+- 17	Gene Start (bp)
+- 18	Gene End (bp)
+"""
+
+# return {transcript_id : ccds_id} 
+def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids):
+  ccds_dict = dict()
+  if transcript_ids:
+    query = query_ccds_template
+    query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
+    query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query)
+    params = urllib.urlencode({'query':query})
+    headers = {"Accept": "text/plain"}
+    if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query)
+    try:
+      if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
+      conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
+      conn.request("POST", "/biomart/martservice", params, headers)
+      response = conn.getresponse()
+      data = response.read()
+      if len(data) > 0:
+        # if debug: print  >> sys.stdout, "%s\n\n" % data
+        lines = data.split('\n')
+        for line in lines:
+          fields = line.split('\t')
+          if len(fields) == 2:
+            (transcript_id,ccds) = fields
+            ccds_dict[transcript_id] = ccds
+        if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines)
+    except Exception, e:
+      sys.stdout.write( "transcript_ids: %s  %s\n" % (transcript_ids, e) )
+  return ccds_dict
+
+def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True):
+  query = query_template
+  query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
+  query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query)
+  query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query)
+  return query
+
+# return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence]
+def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True):
+  transcript_id = snpEffect.transcript
+  chr_pos = snpEffect.pos
+  query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds)
+  if debug: print  >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query)
+  params = urllib.urlencode({'query':query})
+  headers = {"Accept": "text/plain"}
+  pos = snpEffect.pos
+  cds_pos = None  # zero based offset
+  coding_sequence = ''
+  cds_ref = snpEffect.ref
+  cds_alt = snpEffect.alt
+  try: 
+    if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
+    conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
+    conn.request("POST", "/biomart/martservice", params, headers)
+    response = conn.getresponse()
+    data = response.read()
+    if len(data) > 0:
+      # if debug: print  >> sys.stdout, "%s\n\n" % data
+      lines = data.split('\n')
+      # Use the header line to determine the order of fields
+      line = lines[0]
+      # if debug: print  >> sys.stdout, "Headers:\n%s\n" % line
+      colnames = line.split('\t')
+      # for i in range(len(colnames)):
+      #
+      if debug: print  >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames)
+      for line in lines[1:]:
+        if line == None or len(line) < 2: 
+          continue
+        field = line.split('\t')
+        if len(field) != len(colnames):
+          continue
+        if debug: print  >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line)
+        if field[10] == None or len(field[10]) < 1:
+          continue
+        if debug:
+          for i in range(len(colnames)):
+            print  >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i])
+        snpEffect.gene_id = field[1]
+        strand = '+' if int(field[3]) > 0 else '-'
+        snpEffect.strand = strand
+        # coding_sequence is first
+        if len(field) > 10 and re.match('[ATGCN]+',field[0]):
+          if field[10] == None or len(field[10]) < 1:
+            continue
+          cdna_seq = field[0]
+          in_utr = False
+          # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence
+          utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-'))
+          utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-'))
+          utr_5_lens = []
+          for i in range(len(utr_5_starts)):
+            utr_5_start = int(utr_5_starts[i])
+            utr_5_end = int(utr_5_ends[i])
+            utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1)
+            if utr_5_start <= pos  <= utr_5_end :
+              in_utr = True
+              if debug: print  >> sys.stdout, "in utr_5: %s     %s     %s\n" % (utr_5_start,pos,utr_5_end);
+              break
+          utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-'))
+          utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-'))
+          for i in range(len(utr_3_starts)):
+            utr_3_start = int(utr_3_starts[i])
+            utr_3_end = int(utr_3_ends[i])
+            if utr_3_start <= pos  <= utr_3_end :
+              in_utr = True
+              if debug: print  >> sys.stdout, "in utr_3: %s     %s     %s\n" % (utr_3_start,pos,utr_3_end);
+              break
+          # Get coding sequence from cdna
+          cdna_length = int(field[10])
+          cdna_coding_start =  sorted(eval('[' + re.sub(';',',',field[8]) + ']'))
+          cdna_coding_end  = sorted(eval('[' + re.sub(';',',',field[9]) + ']'))
+          cdna_lens = [] 
+          for i in range(len(cdna_coding_start)):
+            cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1)
+          if debug: print  >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
+          cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet
+          for i in range(len(cdna_coding_start)):
+            if debug: print  >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1]
+            coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]]
+          snpEffect.cds = coding_sequence
+          if coding_sequence and len(coding_sequence) >= 3:
+            snpEffect.cds_stop_codon = coding_sequence[-3:]
+          if debug: print  >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence)
+          if debug: print  >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
+          if debug: print  >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens)
+          if  not in_utr:
+            exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-'))
+            exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-'))
+            exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-'))
+            exon_lens = [] 
+            for i in range(len(exon_start)):
+              exon_lens.append(exon_end[i] - exon_start[i] + 1)
+            if debug: print  >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens)
+            if debug: 
+              bp_tot = 0
+              for i in range(len(exon_start)):
+                exon_len = exon_end[i] - exon_start[i] + 1
+                bp_tot += exon_len
+                print >> sys.stdout, "test: %s     %s     %s     %s     %s    %d   %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot)
+            cds_idx = 0
+            for i in range(len(exon_start)):
+              # Does this exon have cds bases - i.e. not entirely in UTR
+              if len(utr_5_ends) > 0:
+                if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]:
+                  continue
+                if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]:
+                  continue
+              exon_len = exon_end[i] - exon_start[i] + 1
+              if exon_start[i] <= pos <= exon_end[i]:
+                if strand == '+':
+                  if cds_idx: # find offset from start of cdna_coding and exon
+                    offset = pos - exon_start[i]
+                  else: # find offset from end of cdna_coding and exon
+                    offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) )
+                  cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset
+                else:  # negative strand
+                  cds_ref = reverseComplement(snpEffect.ref)
+                  cds_alt = reverseComplement(snpEffect.alt)
+                  offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos
+                  cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1)
+                snpEffect.cds_pos = cds_pos
+                snpEffect.cds_ref = cds_ref
+                snpEffect.cds_alt = cds_alt
+                return snpEffect
+              else:  
+                cds_idx += 1
+  except Exception, e:
+    sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) )
+  finally:
+    if conn != None :
+      conn.close()
+  return None
+
+"""
+  Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
+  FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
+"""
+class SnpEffect( object ):
+  report_headings = ['Gene','Variant Position','Reference','Variation','Penetrance Percent','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','AA Variation']
+  def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False):
+    self.chrom = chrom
+    self.pos = int(pos)
+    self.ref = ref
+    self.alt = alt
+    self.freq = float(freq) if freq else None
+    self.depth = int(depth) if depth else None
+    # From SnpEffect VCF String
+    self.effect = None
+    self.effect_impact = None
+    self.functional_class = None
+    self.codon_change = None
+    self.amino_acid_change = None
+    self.amino_acid_length = None
+    self.gene_name = None
+    self.gene_id = None
+    self.gene_biotype = None
+    self.coding = None
+    self.transcript = None
+    self.transcript_ids = None
+    self.exon = None
+    self.cds_stop_codon = None
+    # retrieve from ensembl
+    self.strand = None
+    self.cds_pos = None
+    self.cds_ref = None
+    self.cds_alt = None
+    self.cds = None
+    self.aa_pos = None
+    self.aa_len = None
+    self.alt_stop_codon = None
+  def chrPos(self):
+    return "%s:%s" % (self.chrom,self.pos)
+  def setEffect(self, effect, snpEffVersion = None):
+    if snpEffVersion and snpEffVersion == '2':
+      (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9]
+    else: # SnpEffVersion 3  # adds Amino_Acid_length field
+      (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10]
+      self.amino_acid_length = amino_acid_length
+    self.effect_impact = effect_impact
+    self.functional_class = functional_class
+    self.codon_change = codon_change
+    self.amino_acid_change = amino_acid_change
+    self.gene_name = gene_name
+    self.gene_biotype = gene_biotype
+    self.coding = coding
+    self.transcript = transcript
+    self.exon = exon
+  def parseEffect(self, effect, snpEffVersion = None):
+    (eff,effs) = effect.rstrip(')').split('(')
+    self.effect = eff
+    eff_fields = effs.split('|')
+    if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields)
+    self.setEffect(eff_fields, snpEffVersion)
+  def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False):
+    getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds)
+    return len(self.cds) > 0 if self.cds else False
+  def score(self):
+    return self.freq * self.depth if self.freq and self.depth else -1
+  def getCodingSequence(self):
+    return self.cds
+  def getVariantSequence(self):
+    if self.cds and self.cds_pos and self.cds_ref and self.cds_alt:
+      return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):]
+    else:
+      if debug: print >> sys.stdout, "getVariantSequence:  %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds)
+      return None
+  def getCodingAminoSequence(self):
+    seq = translate(self.cds) if self.cds else None
+    if seq:
+      self.aa_len = len(seq)
+    return seq
+  def getVariantAminoSequence(self):
+    variant_seq = self.getVariantSequence()
+    return translate(variant_seq) if variant_seq else None
+  def getVariantPeptide(self,toStopCodon=True):
+    (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
+    if var_aa:
+      if toStopCodon:
+        end_pos = var_aa_end
+      else:
+        end_pos = len(var_aa)
+      novel_peptide = var_aa[var_aa_pos:end_pos]
+      return novel_peptide
+    return None
+  # [preAA,varAA,postAA, varPeptide, varOffset, subAA]
+  def getNonSynonymousPeptide(self,start_offset,end_offset,toStopCodon=True):
+    (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
+    if var_aa:
+      start_pos = max(var_aa_pos - start_offset,0) if start_offset and int(start_offset) >= 0 else 0
+      if toStopCodon:
+        end_pos = var_aa_end
+      else:
+        end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end
+      try:
+        varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_' 
+        if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos)
+        mutation = "p.%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA)
+        preAA = var_aa[start_pos:var_aa_pos] # N-term side
+        postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side
+        novel_peptide = var_aa[start_pos:end_pos]
+        return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation]
+      except Exception, e:
+        sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) )
+    return None
+  # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop]
+  # positions are 0-based array indexes 
+  def getSequences(self):
+    coding_dna = self.getCodingSequence()
+    coding_aa = self.getCodingAminoSequence()
+    var_dna = self.getVariantSequence()
+    var_aa = self.getVariantAminoSequence()
+    var_aa_pos = None
+    var_aa_end = None
+    if var_aa:
+      var_aa_pos = self.cds_pos / 3
+      for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))):
+        if var_aa[j] != coding_aa[j]:
+          var_aa_pos = j
+          break
+      self.aa_pos = var_aa_pos
+      var_stop = var_aa.find('*',var_aa_pos)
+      if var_stop < 0:
+        var_aa_end = len(var_aa)
+      else:
+        var_aa_end = var_stop + 1 # include the stop codon 
+        stop_pos = var_stop * 3
+        self.alt_stop_codon = var_dna[stop_pos:stop_pos+3]
+    return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end]
+  def inHomopolymer(self,polyA_limit):
+    if polyA_limit:   ## library prep can results in inserting or deleting an A in a poly A region
+      ## check if base changes at cds_pos involve A or T 
+      bdiff = None # the difference between the cds_ref and cds_alt
+      boff = 0 # the offset to the difference
+      if len(self.cds_ref) < len(self.cds_alt):
+        bdiff = re.sub(self.cds_ref,'',self.cds_alt)
+        boff = self.cds_alt.find(bdiff)
+      elif len(self.cds_alt) < len(self.cds_ref):
+        bdiff = re.sub(self.cds_alt,'',self.cds_ref)
+        boff = self.cds_ref.find(bdiff)
+      if bdiff:
+        ## check the number of similar base neighboring the change
+        alt_seq = self.getVariantSequence()
+        subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))]
+        ## adjust match length if the cps_pos is within polyA_limit form sequence end
+        match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit))
+        if debug: print >> sys.stdout, "polyA bdiff %s   %s  %d  %d\n" % (bdiff,subseq, match_len, boff)
+        ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times
+        if bdiff.find('A') >= 0:
+          pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq))
+          match = re.match(pat,subseq)
+          if debug: print >> sys.stdout, "%s %s %s  %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
+          if match:
+            return True
+        if bdiff.find('T') >= 0:
+          pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq))
+          match = re.match(pat,subseq)
+          if debug: print >> sys.stdout, "%s %s %s  %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
+          if match:
+            if debug: print >> sys.stdout, "%s %s %s  %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups())
+            return True
+    return False  
+  def getReportHeader():
+    return report_headings
+  def getReportFields(self):
+    gene_name = self.gene_name  if self.gene_name else ''
+    cds_ref = self.cds_ref if self.cds_ref else ''
+    cds_alt = self.cds_alt if self.cds_alt else ''
+    hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False)
+    if debug: print >> sys.stdout, "HGVS %s" % hgvs
+    var_aa = self.getVariantPeptide(toStopCodon=True)
+    freq = "%2.2f" % self.freq if self.freq else ''
+    depth = "%d" % self.depth if self.depth else ''
+    aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else ''
+    aa_len = "%d" % self.aa_len if self.aa_len else ''
+    gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript
+    stop_codon = self.alt_stop_codon if self.alt_stop_codon else ''
+    chrpos = self.chrPos()
+    aa_change = self.amino_acid_change if self.amino_acid_change else hgvs[5]
+    return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,var_aa if var_aa else '']
+
+def __main__():
+
+  def printReportTsv(output_file, snpEffects):
+    if output_file:
+      print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings)
+      for snpEffect in snpEffects:
+        (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
+        if not snpEffect.effect == 'FRAME_SHIFT':
+          (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
+          novel_peptide = "%s_%s_%s" %  (preAA,varAA,postAA)
+        print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide])
+
+  """
+  http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111
+  http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970
+  http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774
+  """
+  def printReportHtml(output_file, detail_dir, snpEffects):
+    if output_file:
+      print >> output_file, '<HTML><BODY>\n'
+      print >> output_file, '<TABLE BORDER="1">\n'
+      print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Penetrance Percent</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n'
+      for snpEffect in snpEffects:
+        (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
+        refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html'
+        aa_ref = '#'.join([refname,aa_anchor])
+        refpath = os.path.join(detail_dir,refname)
+        try:
+          ref_file = open(refpath,'w')
+          printEffDetailHtml(ref_file,snpEffect)
+          ref_file.close()
+        except Exception, e:
+          sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) )
+        if snpEffect.effect == 'FRAME_SHIFT':
+          print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref,novel_peptide)
+        else:
+          (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
+          print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref, preAA,varAA,postAA)
+      print >> output_file, '</TABLE>\n'
+      print >> output_file, '</BODY></HTML>\n'
+
+  def printEffDetailHtml(output_file,snpEffect,wrap_len=60):
+    (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
+    seq_len = len(coding_seq)
+    print >> output_file, '<HTML><BODY>\n'
+    print >> output_file, '<TABLE>\n'
+    print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name
+    print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt)
+    print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript)
+    print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand
+    print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1))
+    print >> output_file, '</TABLE>\n'
+    if alt_aa:
+      alt_aa_pos = pos / 3 
+      if coding_aa:
+        for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))):
+          if alt_aa[j] != coding_aa[j]:
+            alt_aa_pos = j
+            break
+      alt_aa_end = 0
+      for j in range(alt_aa_pos,len(alt_aa)):
+        if alt_aa[j] == '*':
+          alt_aa_end = j
+          break
+    seq = []
+    for j in range(1,wrap_len+1):
+      seq.append('-' if j % 10 else '|')
+    hrule = '</TD><TD>'.join(seq)
+    print >> output_file, '<TABLE cellspacing="0">'
+    for i in range(0,seq_len,wrap_len):
+      e = min(i + wrap_len,seq_len)
+      sa = i/3 
+      ea = (i+wrap_len)/3
+      print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule)
+      print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
+      for j in range(i,i + wrap_len):
+        if j < len(coding_seq):
+          if pos == j:
+            print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j])
+          elif pos <= j < pos + len(ref):
+            print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j]
+          else:
+            print >> output_file, "<TD>%s</TD>" % coding_seq[j]
+        else:
+          print >> output_file, "<TD></TD>"
+      print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
+      if coding_aa:
+        print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
+        for j in range(sa,ea):
+          if j < len(coding_aa):
+            print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j]
+          else:
+            print >> output_file, "<TD COLSPAN=\"3\"></TD>"
+        print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
+      if alt_aa:
+        print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
+        for j in range(sa,ea):
+          if j < len(alt_aa):
+            if alt_aa_pos == j:
+              print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j])
+            elif alt_aa_pos < j <= alt_aa_end:
+              print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j]
+            else:
+              print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j]
+          else:
+            print >> output_file, "<TD COLSPAN=\"3\"></TD>"
+        print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
+      print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
+      for j in range(i,i + wrap_len):
+        if j < len(alt_seq):
+          if pos <= j < pos + len(alt):
+            print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j]
+          else:
+            print >> output_file, "<TD>%s</TD>" % alt_seq[j]
+        else:
+          print >> output_file, "<TD></TD>"
+      print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
+    print >> output_file, "</TABLE>"
+    print >> output_file, "</BODY></HTML>"
+
+  def printSeqText(output_file,snpEffect, wrap_len=120):
+    nw = 6
+    (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
+    if coding_seq:
+      seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0)
+      rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|'
+      for i in range(0,seq_len,wrap_len):
+        e = min(i + wrap_len,seq_len)
+        sa = i/3 
+        ea = e/3
+        print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'')
+        print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e)
+        if coding_aa:
+          print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,'  '.join(coding_aa[sa:ea]),nw,ea)
+        if alt_aa:
+          print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,'  '.join(alt_aa[sa:ea]),nw,ea)
+        if alt_seq:
+          print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e)
+
+  # Parse the command line options
+  usage = "Usage: snpEff_cds_report.py filter [options]"
+  parser = optparse.OptionParser(usage = usage)
+  parser.add_option("-i", "--in",
+                    action="store", type="string",
+                    dest="vcfFile", help="input vcf file")
+  parser.add_option("-o", "--out",
+                    action="store", type="string",
+                    dest="output", help="output report file")
+  parser.add_option("-H", "--html_report",
+                    action="store", type="string",
+                    dest="html_file", help="html output report file")
+  parser.add_option("-D", "--html_dir",
+                    action="store", type="string",
+                    dest="html_dir", help="html output report file")
+  parser.add_option("-t", "--tsv_file",
+                    action="store", type="string",
+                    dest="tsv_file", help="TAB Separated Value (.tsv) output report file")
+  parser.add_option("-e", "--ensembl_host",
+                    action="store", type="string", 
+                    dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org')
+  parser.add_option("-f", "--effects_filter",
+                    action="store", type="string", default=None,
+                    dest="effects_filter", help="a list of effects to filter")
+  parser.add_option("-O", "--ensembl_dataset",
+                    action="store", type="string", 
+                    dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl')
+  parser.add_option("-p", "--polyA_limit",
+                    action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' )
+  parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects'  )
+  parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries'  )
+  parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout'  )
+
+  (options, args) = parser.parse_args()
+
+  debug = options.debug
+  ensembl_host = options.ensembl_host
+
+  # Check that a single vcf file is given.
+  if options.vcfFile == None:
+    parser.print_help()
+    print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report "
+    exit(1)
+  outputFile = None
+  outputHtml = None
+  detailDir = None
+  outputTsv = None
+  tmpHtmlName = None
+  tmpHtml = None
+  effects_list = []
+  # Set the output file to stdout if no output file was specified.
+  if options.output == None and options.html_file == None and options.tsv_file == None:
+    outputFile = sys.stdout
+  if options.output != None:
+    output = os.path.abspath(options.output)
+    outputFile = open(output, 'w')
+  if options.tsv_file != None:
+    output = os.path.abspath(options.tsv_file)
+    outputTsv = open(output, 'w')
+  if options.html_file != None:
+    output = os.path.abspath(options.html_file)
+    outputHtml = open(output, 'w')
+    if options.html_dir == None:
+      detailDir = os.path.dirname(os.path.abspath(output))
+    else:
+      detailDir = options.html_dir
+  if detailDir:
+    if not os.path.isdir(detailDir):
+      os.makedirs(detailDir)
+  if options.effects_filter:
+    eff_codes = options.effects_filter.split(',')
+    for val in eff_codes:
+      code = val.strip()
+      if code:
+        effects_list.append(code)
+      
+  # 
+  # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
+  try:
+    snpEffects = []
+    homopolymers = []
+    polya_count = 0
+    polya_list = []
+    fh = open( options.vcfFile )
+    while True:
+      line = fh.readline()
+      if not line:
+        break #EOF
+      if line:
+        if outputFile:
+          print >> outputFile, "%s\n" % line
+        line = line.strip()
+        if len(line) < 1:
+          continue
+        elif line.startswith('#'):
+          # check for SnpEffVersion
+          """
+          ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
+          ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
+          """
+          if line.startswith('##SnpEffVersion='):
+            m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line)
+            if m:
+              SnpEffVersion = m.group(2)
+          # check for SnpEff Info Description
+          """
+          ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
+          ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
+          ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
+          ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
+          """
+          pass
+        else:
+          if debug: print >> sys.stdout, "%s\n" % line
+          freq = None
+          depth = None
+          fields = line.split('\t')
+          (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]
+          if debug: print  >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10) 
+          if options.debug: print(fields)
+          for idx,alt in enumerate(alts.split(',')):
+            if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts)
+            if not re.match('^[ATCG]*$',alt):
+              print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s  %s\n" % (chrom,pos,alt)
+            for info_item in info.split(';'):
+              try:
+                (key,val) = info_item.split('=',1)  
+                if key == 'SAF':   # Usually a SAF for each variant: if  A,AG   then SAF=0.333333333333333,SAF=0.333333333333333;
+                  freqs = info_item.split(',')
+                  (k,v) = freqs[idx].split('=',1)
+                  freq = v
+                elif key == 'DP':
+                  depth = val
+                elif key == 'EFF':
+                  transcript_ids = []
+                  effects = val.split(',')
+                  ccds_dict = None
+                  for effect in effects:
+                    (eff,effs) = effect.rstrip(')').split('(')
+                    eff_fields = effs.split('|')
+                    if SnpEffVersion == '2':
+                      """
+                      Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
+                      EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
+                      """
+                      (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9]
+                    else: # SnpEffVersion 3  # adds Amino_Acid_length field
+                      """
+                      Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
+                      FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
+                      """
+                      (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]
+                    if transcript:
+                      transcript_ids.append(transcript)
+                  if debug: print  >> sys.stdout, "transcripts: %s" % (transcript_ids) 
+                  if options.with_ccds:
+                    ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids)
+                    if debug: print  >> sys.stdout, "ccds_dict: %s" % (ccds_dict) 
+                  for effect in effects:
+                    snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth)
+                    snpEffect.transcript_ids = transcript_ids
+                    snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion)
+                    if effects_list and not snpEffect.effect in effects_list:
+                      continue
+                    if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict):
+                      if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds):
+                        if snpEffect.cds_pos:
+                          if snpEffect.inHomopolymer(options.polyA_limit):
+                            homopolymers.append(snpEffect)
+                          else:
+                            snpEffects.append(snpEffect)
+                    if outputFile:
+                      print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields())
+                      printSeqText(outputFile,snpEffect)
+                    if snpEffect.cds and snpEffect.cds_pos and not options.all_effects:
+                      ## Just report the first sequence returned for a vcf line
+                      break
+              except Exception, e:
+                sys.stderr.write( "line: %s err: %s\n" % (line,e) )
+    print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers)
+    # Sort snpEffects
+    snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True)
+    if outputHtml:
+      printReportHtml(outputHtml, detailDir, snpEffects)
+    if outputTsv:
+      printReportTsv(outputTsv,snpEffects)
+  except Exception, e:
+    print(str(e))         
+
+if __name__ == "__main__": __main__()
+
author	jjohnson
date	Tue, 12 Mar 2013 13:59:14 -0400
parents
children	29b286896c50