changeset 0:cdbdac66d6b5

Uploaded
author jjohnson
date Tue, 12 Mar 2013 13:59:14 -0400
parents
children 29b286896c50
files README snpEff_cds_report.py snpEff_cds_report.xml test-data/snpeff.vcf
diffstat 4 files changed, 1003 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README	Tue Mar 12 13:59:14 2013 -0400
@@ -0,0 +1,4 @@
+snpeff_cds_report reports the protein coding changes that result from frame shift and non_synonymous variants.  
+It requires the input to be from the MMuFF (Missense Mutation and Frameshift Finder) workflow.  
+Specifically it requires the input from snpEff ( http://snpeff.sourceforge.net/ ) set with an Ensembl genome build,
+which will annotate variants with Ensembl transctipt IDs.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpEff_cds_report.py	Tue Mar 12 13:59:14 2013 -0400
@@ -0,0 +1,801 @@
+#!/usr/bin/python
+import sys,os,tempfile,re
+import httplib, urllib
+import optparse
+#import vcfClass
+#from vcfClass import *
+#import tools
+#from tools import *
+
+debug = False
+cds_anchor = 'cds_variation'
+aa_anchor = 'aa_variation'
+
+codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
+    "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
+    "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
+    "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
+    "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
+    "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
+    "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
+    "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
+    "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
+    "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
+    "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
+    "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
+    "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
+    "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
+    "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
+    "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
+
+def reverseComplement(seq) :
+  rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper()
+  return rev
+
+def translate(seq) :
+  rna = seq.upper().replace('T','U')
+  aa = []
+  for i in range(0,len(rna) - 2, 3):
+    aa.append(codon_map[rna[i:i+3]])
+  return ''.join(aa)
+
+"""
+SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand
+Queries that request a sequence always return the sequence in the first column regardless of the order of attributes.
+"""
+query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE Query>
+<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
+	<Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
+		<Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
+		<Filter name = "with_ccds" excluded = "0"/>
+		<Attribute name = "ensembl_transcript_id" />
+		<Attribute name = "ccds" />
+	</Dataset>
+</Query>
+"""
+ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>'
+query_template = """<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE Query>
+<Query  virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
+        <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
+                <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
+                <Filter name = "biotype" value = "protein_coding"/>
+                __CCDS_FILTER_HERE__
+                <Attribute name = "cdna" />
+                <Attribute name = "ensembl_gene_id" />
+                <Attribute name = "ensembl_transcript_id" />
+                <Attribute name = "strand" />
+                <Attribute name = "transcript_start" />
+	        <Attribute name = "transcript_end"/>
+	        <Attribute name = "exon_chrom_start"/>
+	        <Attribute name = "exon_chrom_end"/>
+		<Attribute name = "cdna_coding_start" />
+		<Attribute name = "cdna_coding_end" />
+	        <Attribute name = "cds_length"/>
+	        <Attribute name = "rank"/>
+                <Attribute name = "5_utr_start" />
+                <Attribute name = "5_utr_end" />
+                <Attribute name = "3_utr_start" />
+                <Attribute name = "3_utr_end" />
+	        <Attribute name = "ensembl_peptide_id"/>
+		<Attribute name = "start_position" />
+		<Attribute name = "end_position" />
+        </Dataset>
+</Query>
+"""
+"""
+Columns(19):
+['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)']
+
+  0	cDNA sequence
+  1	Ensembl Gene ID
+  2	Ensembl Transcript ID
+  3	Strand
+- 4	Transcript Start (bp)
+- 5	Transcript End (bp)
+  6	Exon Chr Start (bp)
+  7	Exon Chr End (bp)
+  8	CDS Start
+  9	CDS End
+  10	CDS Length
+- 11	Exon Rank in Transcript
+  12	5' UTR Start
+  13	5' UTR End
+  14	3' UTR Start
+  15	3' UTR End
+- 16	Ensembl Protein ID
+- 17	Gene Start (bp)
+- 18	Gene End (bp)
+"""
+
+# return {transcript_id : ccds_id} 
+def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids):
+  ccds_dict = dict()
+  if transcript_ids:
+    query = query_ccds_template
+    query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
+    query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query)
+    params = urllib.urlencode({'query':query})
+    headers = {"Accept": "text/plain"}
+    if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query)
+    try:
+      if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
+      conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
+      conn.request("POST", "/biomart/martservice", params, headers)
+      response = conn.getresponse()
+      data = response.read()
+      if len(data) > 0:
+        # if debug: print  >> sys.stdout, "%s\n\n" % data
+        lines = data.split('\n')
+        for line in lines:
+          fields = line.split('\t')
+          if len(fields) == 2:
+            (transcript_id,ccds) = fields
+            ccds_dict[transcript_id] = ccds
+        if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines)
+    except Exception, e:
+      sys.stdout.write( "transcript_ids: %s  %s\n" % (transcript_ids, e) )
+  return ccds_dict
+
+def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True):
+  query = query_template
+  query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
+  query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query)
+  query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query)
+  return query
+
+# return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence]
+def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True):
+  transcript_id = snpEffect.transcript
+  chr_pos = snpEffect.pos
+  query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds)
+  if debug: print  >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query)
+  params = urllib.urlencode({'query':query})
+  headers = {"Accept": "text/plain"}
+  pos = snpEffect.pos
+  cds_pos = None  # zero based offset
+  coding_sequence = ''
+  cds_ref = snpEffect.ref
+  cds_alt = snpEffect.alt
+  try: 
+    if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
+    conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
+    conn.request("POST", "/biomart/martservice", params, headers)
+    response = conn.getresponse()
+    data = response.read()
+    if len(data) > 0:
+      # if debug: print  >> sys.stdout, "%s\n\n" % data
+      lines = data.split('\n')
+      # Use the header line to determine the order of fields
+      line = lines[0]
+      # if debug: print  >> sys.stdout, "Headers:\n%s\n" % line
+      colnames = line.split('\t')
+      # for i in range(len(colnames)):
+      #
+      if debug: print  >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames)
+      for line in lines[1:]:
+        if line == None or len(line) < 2: 
+          continue
+        field = line.split('\t')
+        if len(field) != len(colnames):
+          continue
+        if debug: print  >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line)
+        if field[10] == None or len(field[10]) < 1:
+          continue
+        if debug:
+          for i in range(len(colnames)):
+            print  >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i])
+        snpEffect.gene_id = field[1]
+        strand = '+' if int(field[3]) > 0 else '-'
+        snpEffect.strand = strand
+        # coding_sequence is first
+        if len(field) > 10 and re.match('[ATGCN]+',field[0]):
+          if field[10] == None or len(field[10]) < 1:
+            continue
+          cdna_seq = field[0]
+          in_utr = False
+          # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence
+          utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-'))
+          utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-'))
+          utr_5_lens = []
+          for i in range(len(utr_5_starts)):
+            utr_5_start = int(utr_5_starts[i])
+            utr_5_end = int(utr_5_ends[i])
+            utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1)
+            if utr_5_start <= pos  <= utr_5_end :
+              in_utr = True
+              if debug: print  >> sys.stdout, "in utr_5: %s     %s     %s\n" % (utr_5_start,pos,utr_5_end);
+              break
+          utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-'))
+          utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-'))
+          for i in range(len(utr_3_starts)):
+            utr_3_start = int(utr_3_starts[i])
+            utr_3_end = int(utr_3_ends[i])
+            if utr_3_start <= pos  <= utr_3_end :
+              in_utr = True
+              if debug: print  >> sys.stdout, "in utr_3: %s     %s     %s\n" % (utr_3_start,pos,utr_3_end);
+              break
+          # Get coding sequence from cdna
+          cdna_length = int(field[10])
+          cdna_coding_start =  sorted(eval('[' + re.sub(';',',',field[8]) + ']'))
+          cdna_coding_end  = sorted(eval('[' + re.sub(';',',',field[9]) + ']'))
+          cdna_lens = [] 
+          for i in range(len(cdna_coding_start)):
+            cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1)
+          if debug: print  >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
+          cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet
+          for i in range(len(cdna_coding_start)):
+            if debug: print  >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1]
+            coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]]
+          snpEffect.cds = coding_sequence
+          if coding_sequence and len(coding_sequence) >= 3:
+            snpEffect.cds_stop_codon = coding_sequence[-3:]
+          if debug: print  >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence)
+          if debug: print  >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
+          if debug: print  >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens)
+          if  not in_utr:
+            exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-'))
+            exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-'))
+            exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-'))
+            exon_lens = [] 
+            for i in range(len(exon_start)):
+              exon_lens.append(exon_end[i] - exon_start[i] + 1)
+            if debug: print  >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens)
+            if debug: 
+              bp_tot = 0
+              for i in range(len(exon_start)):
+                exon_len = exon_end[i] - exon_start[i] + 1
+                bp_tot += exon_len
+                print >> sys.stdout, "test: %s     %s     %s     %s     %s    %d   %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot)
+            cds_idx = 0
+            for i in range(len(exon_start)):
+              # Does this exon have cds bases - i.e. not entirely in UTR
+              if len(utr_5_ends) > 0:
+                if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]:
+                  continue
+                if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]:
+                  continue
+              exon_len = exon_end[i] - exon_start[i] + 1
+              if exon_start[i] <= pos <= exon_end[i]:
+                if strand == '+':
+                  if cds_idx: # find offset from start of cdna_coding and exon
+                    offset = pos - exon_start[i]
+                  else: # find offset from end of cdna_coding and exon
+                    offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) )
+                  cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset
+                else:  # negative strand
+                  cds_ref = reverseComplement(snpEffect.ref)
+                  cds_alt = reverseComplement(snpEffect.alt)
+                  offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos
+                  cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1)
+                snpEffect.cds_pos = cds_pos
+                snpEffect.cds_ref = cds_ref
+                snpEffect.cds_alt = cds_alt
+                return snpEffect
+              else:  
+                cds_idx += 1
+  except Exception, e:
+    sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) )
+  finally:
+    if conn != None :
+      conn.close()
+  return None
+
+"""
+  Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
+  FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
+"""
+class SnpEffect( object ):
+  report_headings = ['Gene','Variant Position','Reference','Variation','Penetrance Percent','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','AA Variation']
+  def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False):
+    self.chrom = chrom
+    self.pos = int(pos)
+    self.ref = ref
+    self.alt = alt
+    self.freq = float(freq) if freq else None
+    self.depth = int(depth) if depth else None
+    # From SnpEffect VCF String
+    self.effect = None
+    self.effect_impact = None
+    self.functional_class = None
+    self.codon_change = None
+    self.amino_acid_change = None
+    self.amino_acid_length = None
+    self.gene_name = None
+    self.gene_id = None
+    self.gene_biotype = None
+    self.coding = None
+    self.transcript = None
+    self.transcript_ids = None
+    self.exon = None
+    self.cds_stop_codon = None
+    # retrieve from ensembl
+    self.strand = None
+    self.cds_pos = None
+    self.cds_ref = None
+    self.cds_alt = None
+    self.cds = None
+    self.aa_pos = None
+    self.aa_len = None
+    self.alt_stop_codon = None
+  def chrPos(self):
+    return "%s:%s" % (self.chrom,self.pos)
+  def setEffect(self, effect, snpEffVersion = None):
+    if snpEffVersion and snpEffVersion == '2':
+      (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9]
+    else: # SnpEffVersion 3  # adds Amino_Acid_length field
+      (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10]
+      self.amino_acid_length = amino_acid_length
+    self.effect_impact = effect_impact
+    self.functional_class = functional_class
+    self.codon_change = codon_change
+    self.amino_acid_change = amino_acid_change
+    self.gene_name = gene_name
+    self.gene_biotype = gene_biotype
+    self.coding = coding
+    self.transcript = transcript
+    self.exon = exon
+  def parseEffect(self, effect, snpEffVersion = None):
+    (eff,effs) = effect.rstrip(')').split('(')
+    self.effect = eff
+    eff_fields = effs.split('|')
+    if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields)
+    self.setEffect(eff_fields, snpEffVersion)
+  def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False):
+    getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds)
+    return len(self.cds) > 0 if self.cds else False
+  def score(self):
+    return self.freq * self.depth if self.freq and self.depth else -1
+  def getCodingSequence(self):
+    return self.cds
+  def getVariantSequence(self):
+    if self.cds and self.cds_pos and self.cds_ref and self.cds_alt:
+      return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):]
+    else:
+      if debug: print >> sys.stdout, "getVariantSequence:  %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds)
+      return None
+  def getCodingAminoSequence(self):
+    seq = translate(self.cds) if self.cds else None
+    if seq:
+      self.aa_len = len(seq)
+    return seq
+  def getVariantAminoSequence(self):
+    variant_seq = self.getVariantSequence()
+    return translate(variant_seq) if variant_seq else None
+  def getVariantPeptide(self,toStopCodon=True):
+    (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
+    if var_aa:
+      if toStopCodon:
+        end_pos = var_aa_end
+      else:
+        end_pos = len(var_aa)
+      novel_peptide = var_aa[var_aa_pos:end_pos]
+      return novel_peptide
+    return None
+  # [preAA,varAA,postAA, varPeptide, varOffset, subAA]
+  def getNonSynonymousPeptide(self,start_offset,end_offset,toStopCodon=True):
+    (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
+    if var_aa:
+      start_pos = max(var_aa_pos - start_offset,0) if start_offset and int(start_offset) >= 0 else 0
+      if toStopCodon:
+        end_pos = var_aa_end
+      else:
+        end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end
+      try:
+        varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_' 
+        if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos)
+        mutation = "p.%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA)
+        preAA = var_aa[start_pos:var_aa_pos] # N-term side
+        postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side
+        novel_peptide = var_aa[start_pos:end_pos]
+        return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation]
+      except Exception, e:
+        sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) )
+    return None
+  # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop]
+  # positions are 0-based array indexes 
+  def getSequences(self):
+    coding_dna = self.getCodingSequence()
+    coding_aa = self.getCodingAminoSequence()
+    var_dna = self.getVariantSequence()
+    var_aa = self.getVariantAminoSequence()
+    var_aa_pos = None
+    var_aa_end = None
+    if var_aa:
+      var_aa_pos = self.cds_pos / 3
+      for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))):
+        if var_aa[j] != coding_aa[j]:
+          var_aa_pos = j
+          break
+      self.aa_pos = var_aa_pos
+      var_stop = var_aa.find('*',var_aa_pos)
+      if var_stop < 0:
+        var_aa_end = len(var_aa)
+      else:
+        var_aa_end = var_stop + 1 # include the stop codon 
+        stop_pos = var_stop * 3
+        self.alt_stop_codon = var_dna[stop_pos:stop_pos+3]
+    return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end]
+  def inHomopolymer(self,polyA_limit):
+    if polyA_limit:   ## library prep can results in inserting or deleting an A in a poly A region
+      ## check if base changes at cds_pos involve A or T 
+      bdiff = None # the difference between the cds_ref and cds_alt
+      boff = 0 # the offset to the difference
+      if len(self.cds_ref) < len(self.cds_alt):
+        bdiff = re.sub(self.cds_ref,'',self.cds_alt)
+        boff = self.cds_alt.find(bdiff)
+      elif len(self.cds_alt) < len(self.cds_ref):
+        bdiff = re.sub(self.cds_alt,'',self.cds_ref)
+        boff = self.cds_ref.find(bdiff)
+      if bdiff:
+        ## check the number of similar base neighboring the change
+        alt_seq = self.getVariantSequence()
+        subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))]
+        ## adjust match length if the cps_pos is within polyA_limit form sequence end
+        match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit))
+        if debug: print >> sys.stdout, "polyA bdiff %s   %s  %d  %d\n" % (bdiff,subseq, match_len, boff)
+        ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times
+        if bdiff.find('A') >= 0:
+          pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq))
+          match = re.match(pat,subseq)
+          if debug: print >> sys.stdout, "%s %s %s  %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
+          if match:
+            return True
+        if bdiff.find('T') >= 0:
+          pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq))
+          match = re.match(pat,subseq)
+          if debug: print >> sys.stdout, "%s %s %s  %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
+          if match:
+            if debug: print >> sys.stdout, "%s %s %s  %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups())
+            return True
+    return False  
+  def getReportHeader():
+    return report_headings
+  def getReportFields(self):
+    gene_name = self.gene_name  if self.gene_name else ''
+    cds_ref = self.cds_ref if self.cds_ref else ''
+    cds_alt = self.cds_alt if self.cds_alt else ''
+    hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False)
+    if debug: print >> sys.stdout, "HGVS %s" % hgvs
+    var_aa = self.getVariantPeptide(toStopCodon=True)
+    freq = "%2.2f" % self.freq if self.freq else ''
+    depth = "%d" % self.depth if self.depth else ''
+    aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else ''
+    aa_len = "%d" % self.aa_len if self.aa_len else ''
+    gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript
+    stop_codon = self.alt_stop_codon if self.alt_stop_codon else ''
+    chrpos = self.chrPos()
+    aa_change = self.amino_acid_change if self.amino_acid_change else hgvs[5]
+    return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,var_aa if var_aa else '']
+
+def __main__():
+
+  def printReportTsv(output_file, snpEffects):
+    if output_file:
+      print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings)
+      for snpEffect in snpEffects:
+        (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
+        if not snpEffect.effect == 'FRAME_SHIFT':
+          (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
+          novel_peptide = "%s_%s_%s" %  (preAA,varAA,postAA)
+        print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide])
+
+  """
+  http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111
+  http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970
+  http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774
+  """
+  def printReportHtml(output_file, detail_dir, snpEffects):
+    if output_file:
+      print >> output_file, '<HTML><BODY>\n'
+      print >> output_file, '<TABLE BORDER="1">\n'
+      print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Penetrance Percent</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n'
+      for snpEffect in snpEffects:
+        (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
+        refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html'
+        aa_ref = '#'.join([refname,aa_anchor])
+        refpath = os.path.join(detail_dir,refname)
+        try:
+          ref_file = open(refpath,'w')
+          printEffDetailHtml(ref_file,snpEffect)
+          ref_file.close()
+        except Exception, e:
+          sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) )
+        if snpEffect.effect == 'FRAME_SHIFT':
+          print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref,novel_peptide)
+        else:
+          (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
+          print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref, preAA,varAA,postAA)
+      print >> output_file, '</TABLE>\n'
+      print >> output_file, '</BODY></HTML>\n'
+
+  def printEffDetailHtml(output_file,snpEffect,wrap_len=60):
+    (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
+    seq_len = len(coding_seq)
+    print >> output_file, '<HTML><BODY>\n'
+    print >> output_file, '<TABLE>\n'
+    print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name
+    print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt)
+    print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript)
+    print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand
+    print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1))
+    print >> output_file, '</TABLE>\n'
+    if alt_aa:
+      alt_aa_pos = pos / 3 
+      if coding_aa:
+        for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))):
+          if alt_aa[j] != coding_aa[j]:
+            alt_aa_pos = j
+            break
+      alt_aa_end = 0
+      for j in range(alt_aa_pos,len(alt_aa)):
+        if alt_aa[j] == '*':
+          alt_aa_end = j
+          break
+    seq = []
+    for j in range(1,wrap_len+1):
+      seq.append('-' if j % 10 else '|')
+    hrule = '</TD><TD>'.join(seq)
+    print >> output_file, '<TABLE cellspacing="0">'
+    for i in range(0,seq_len,wrap_len):
+      e = min(i + wrap_len,seq_len)
+      sa = i/3 
+      ea = (i+wrap_len)/3
+      print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule)
+      print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
+      for j in range(i,i + wrap_len):
+        if j < len(coding_seq):
+          if pos == j:
+            print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j])
+          elif pos <= j < pos + len(ref):
+            print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j]
+          else:
+            print >> output_file, "<TD>%s</TD>" % coding_seq[j]
+        else:
+          print >> output_file, "<TD></TD>"
+      print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
+      if coding_aa:
+        print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
+        for j in range(sa,ea):
+          if j < len(coding_aa):
+            print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j]
+          else:
+            print >> output_file, "<TD COLSPAN=\"3\"></TD>"
+        print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
+      if alt_aa:
+        print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
+        for j in range(sa,ea):
+          if j < len(alt_aa):
+            if alt_aa_pos == j:
+              print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j])
+            elif alt_aa_pos < j <= alt_aa_end:
+              print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j]
+            else:
+              print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j]
+          else:
+            print >> output_file, "<TD COLSPAN=\"3\"></TD>"
+        print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
+      print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
+      for j in range(i,i + wrap_len):
+        if j < len(alt_seq):
+          if pos <= j < pos + len(alt):
+            print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j]
+          else:
+            print >> output_file, "<TD>%s</TD>" % alt_seq[j]
+        else:
+          print >> output_file, "<TD></TD>"
+      print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
+    print >> output_file, "</TABLE>"
+    print >> output_file, "</BODY></HTML>"
+
+  def printSeqText(output_file,snpEffect, wrap_len=120):
+    nw = 6
+    (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
+    if coding_seq:
+      seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0)
+      rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|'
+      for i in range(0,seq_len,wrap_len):
+        e = min(i + wrap_len,seq_len)
+        sa = i/3 
+        ea = e/3
+        print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'')
+        print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e)
+        if coding_aa:
+          print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,'  '.join(coding_aa[sa:ea]),nw,ea)
+        if alt_aa:
+          print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,'  '.join(alt_aa[sa:ea]),nw,ea)
+        if alt_seq:
+          print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e)
+
+  # Parse the command line options
+  usage = "Usage: snpEff_cds_report.py filter [options]"
+  parser = optparse.OptionParser(usage = usage)
+  parser.add_option("-i", "--in",
+                    action="store", type="string",
+                    dest="vcfFile", help="input vcf file")
+  parser.add_option("-o", "--out",
+                    action="store", type="string",
+                    dest="output", help="output report file")
+  parser.add_option("-H", "--html_report",
+                    action="store", type="string",
+                    dest="html_file", help="html output report file")
+  parser.add_option("-D", "--html_dir",
+                    action="store", type="string",
+                    dest="html_dir", help="html output report file")
+  parser.add_option("-t", "--tsv_file",
+                    action="store", type="string",
+                    dest="tsv_file", help="TAB Separated Value (.tsv) output report file")
+  parser.add_option("-e", "--ensembl_host",
+                    action="store", type="string", 
+                    dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org')
+  parser.add_option("-f", "--effects_filter",
+                    action="store", type="string", default=None,
+                    dest="effects_filter", help="a list of effects to filter")
+  parser.add_option("-O", "--ensembl_dataset",
+                    action="store", type="string", 
+                    dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl')
+  parser.add_option("-p", "--polyA_limit",
+                    action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' )
+  parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects'  )
+  parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries'  )
+  parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout'  )
+
+  (options, args) = parser.parse_args()
+
+  debug = options.debug
+  ensembl_host = options.ensembl_host
+
+  # Check that a single vcf file is given.
+  if options.vcfFile == None:
+    parser.print_help()
+    print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report "
+    exit(1)
+  outputFile = None
+  outputHtml = None
+  detailDir = None
+  outputTsv = None
+  tmpHtmlName = None
+  tmpHtml = None
+  effects_list = []
+  # Set the output file to stdout if no output file was specified.
+  if options.output == None and options.html_file == None and options.tsv_file == None:
+    outputFile = sys.stdout
+  if options.output != None:
+    output = os.path.abspath(options.output)
+    outputFile = open(output, 'w')
+  if options.tsv_file != None:
+    output = os.path.abspath(options.tsv_file)
+    outputTsv = open(output, 'w')
+  if options.html_file != None:
+    output = os.path.abspath(options.html_file)
+    outputHtml = open(output, 'w')
+    if options.html_dir == None:
+      detailDir = os.path.dirname(os.path.abspath(output))
+    else:
+      detailDir = options.html_dir
+  if detailDir:
+    if not os.path.isdir(detailDir):
+      os.makedirs(detailDir)
+  if options.effects_filter:
+    eff_codes = options.effects_filter.split(',')
+    for val in eff_codes:
+      code = val.strip()
+      if code:
+        effects_list.append(code)
+      
+  # 
+  # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
+  try:
+    snpEffects = []
+    homopolymers = []
+    polya_count = 0
+    polya_list = []
+    fh = open( options.vcfFile )
+    while True:
+      line = fh.readline()
+      if not line:
+        break #EOF
+      if line:
+        if outputFile:
+          print >> outputFile, "%s\n" % line
+        line = line.strip()
+        if len(line) < 1:
+          continue
+        elif line.startswith('#'):
+          # check for SnpEffVersion
+          """
+          ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
+          ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
+          """
+          if line.startswith('##SnpEffVersion='):
+            m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line)
+            if m:
+              SnpEffVersion = m.group(2)
+          # check for SnpEff Info Description
+          """
+          ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
+          ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
+          ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
+          ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
+          """
+          pass
+        else:
+          if debug: print >> sys.stdout, "%s\n" % line
+          freq = None
+          depth = None
+          fields = line.split('\t')
+          (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]
+          if debug: print  >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10) 
+          if options.debug: print(fields)
+          for idx,alt in enumerate(alts.split(',')):
+            if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts)
+            if not re.match('^[ATCG]*$',alt):
+              print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s  %s\n" % (chrom,pos,alt)
+            for info_item in info.split(';'):
+              try:
+                (key,val) = info_item.split('=',1)  
+                if key == 'SAF':   # Usually a SAF for each variant: if  A,AG   then SAF=0.333333333333333,SAF=0.333333333333333;
+                  freqs = info_item.split(',')
+                  (k,v) = freqs[idx].split('=',1)
+                  freq = v
+                elif key == 'DP':
+                  depth = val
+                elif key == 'EFF':
+                  transcript_ids = []
+                  effects = val.split(',')
+                  ccds_dict = None
+                  for effect in effects:
+                    (eff,effs) = effect.rstrip(')').split('(')
+                    eff_fields = effs.split('|')
+                    if SnpEffVersion == '2':
+                      """
+                      Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
+                      EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
+                      """
+                      (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9]
+                    else: # SnpEffVersion 3  # adds Amino_Acid_length field
+                      """
+                      Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
+                      FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
+                      """
+                      (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]
+                    if transcript:
+                      transcript_ids.append(transcript)
+                  if debug: print  >> sys.stdout, "transcripts: %s" % (transcript_ids) 
+                  if options.with_ccds:
+                    ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids)
+                    if debug: print  >> sys.stdout, "ccds_dict: %s" % (ccds_dict) 
+                  for effect in effects:
+                    snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth)
+                    snpEffect.transcript_ids = transcript_ids
+                    snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion)
+                    if effects_list and not snpEffect.effect in effects_list:
+                      continue
+                    if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict):
+                      if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds):
+                        if snpEffect.cds_pos:
+                          if snpEffect.inHomopolymer(options.polyA_limit):
+                            homopolymers.append(snpEffect)
+                          else:
+                            snpEffects.append(snpEffect)
+                    if outputFile:
+                      print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields())
+                      printSeqText(outputFile,snpEffect)
+                    if snpEffect.cds and snpEffect.cds_pos and not options.all_effects:
+                      ## Just report the first sequence returned for a vcf line
+                      break
+              except Exception, e:
+                sys.stderr.write( "line: %s err: %s\n" % (line,e) )
+    print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers)
+    # Sort snpEffects
+    snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True)
+    if outputHtml:
+      printReportHtml(outputHtml, detailDir, snpEffects)
+    if outputTsv:
+      printReportTsv(outputTsv,snpEffects)
+  except Exception, e:
+    print(str(e))         
+
+if __name__ == "__main__": __main__()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpEff_cds_report.xml	Tue Mar 12 13:59:14 2013 -0400
@@ -0,0 +1,191 @@
+<tool id="SnpEff-cds-report" name="SnpEff Ensembl CDS" version="1.0">
+  <description>Report Variant coding sequence changes for SnpEffects</description>
+  <command interpreter="python">
+   snpEff_cds_report.py --in $snp_effect_vcf 
+   #if len($ensembl_host.__str__.strip) > 0:
+     --ensembl_host ${ensembl_host}.archive.ensembl.org
+   #end if
+   #if len($ensembl_dataset.__str__.strip) > 0:
+     --ensembl_dataset $ensembl_dataset
+   #end if
+   #if len($polya.__str__.strip) > 0:
+     --polyA_limit $polya
+   #end if
+   #if $effects_filter
+     --effects_filter=$effects_filter
+   #end if
+   $all_effects
+   $with_ccds
+   #if $report_format.__str__.find('html') >= 0:
+     --html_report $html_report
+     --html_dir $html_report.extra_files_path
+   #end if
+   #if $report_format.__str__.find('tsv') >= 0:
+     --tsv_file $tsv_report
+   #end if
+   #if $report_format.__str__.find('detailed') >= 0:
+     --out $text_report
+   #end if
+  </command>
+  <inputs>
+    <param name="snp_effect_vcf" type="data" format="vcf" label="SnpEffect VCF report" help="VCF output from SnpEffect or SnpSift"/>
+    <param name="ensembl_host" type="text" value="" label="Biomart Server (defaults to www.biomart.org, for an archive server enter the archive date)" help="feb2012 - would access ensembl server: feb2012.archive.ensembl.org">
+      <validator type="regex" message="format: mmmYYYY">^([jfmajsond][aepuco][nbrynlgptvc]20[0-9][0-9])?$</validator>
+    </param>
+    <param name="ensembl_dataset" type="select" label="Oragnism" help="">
+      <option value="hsapiens_gene_ensembl" selected="true">Homo sapiens</option>
+      <option value="mmusculus_gene_ensembl">Mus musculus</option>
+      <option value="rnorvegicus_gene_ensembl">Rattus norvegicus</option>
+      <option value="drerio_gene_ensembl">Danio rerio</option>
+      <option value="ggallus_gene_ensembl">Gallus gallus</option>
+      <option value="amelanoleuca_gene_ensembl">Ailuropoda melanoleuca</option>
+      <option value="acarolinensis_gene_ensembl">Anolis carolinensis</option>
+      <option value="btaurus_gene_ensembl">Bos taurus</option>
+      <option value="celegans_gene_ensembl">Caenorhabditis elegans</option>
+      <option value="cjacchus_gene_ensembl">Callithrix jacchus</option>
+      <option value="cfamiliaris_gene_ensembl">Canis familiaris</option>                        
+      <option value="cporcellus_gene_ensembl">Cavia porcellus</option>
+      <option value="choffmanni_gene_ensembl">Choloepus hoffmanni</option>
+      <option value="cintestinalis_gene_ensembl">Ciona intestinalis</option>
+      <option value="csavignyi_gene_ensembl">Ciona savignyi</option>
+      <option value="dnovemcinctus_gene_ensembl">Dasypus novemcinctus</option>
+      <option value="dordii_gene_ensembl">Dipodomys ordii</option>
+      <option value="dmelanogaster_gene_ensembl">Drosophila melanogaster</option>  
+      <option value="etelfairi_gene_ensembl">Echinops telfairi</option>    
+      <option value="ecaballus_gene_ensembl">Equus caballus</option>                       
+      <option value="eeuropaeus_gene_ensembl">Erinaceus europaeus</option>
+      <option value="fcatus_gene_ensembl">Felis catus</option>
+      <option value="gmorhua_gene_ensembl">Gadus morhua</option>
+      <option value="gaculeatus_gene_ensembl">Gasterosteus aculeatus</option>
+      <option value="ggorilla_gene_ensembl">Gorilla gorilla</option>
+      <option value="itridecemlineatus_gene_ensembl">Ictidomys tridecemlineatus</option> 
+      <option value="lchalumnae_gene_ensembl">Latimeria chalumnae</option>                                
+      <option value="lafricana_gene_ensembl">Loxodonta africana</option>
+      <option value="mmulatta_gene_ensembl">Macaca mulatta</option>
+      <option value="meugenii_gene_ensembl">Macropus eugenii</option>
+      <option value="mgallopavo_gene_ensembl">Meleagris gallopavo</option>
+      <option value="mmurinus_gene_ensembl">Microcebus murinus</option>
+      <option value="mdomestica_gene_ensembl">Monodelphis domestica</option>
+      <option value="mlucifugus_gene_ensembl">Myotis lucifugus</option>
+      <option value="nleucogenys_gene_ensembl">Nomascus leucogenys</option>
+      <option value="oprinceps_gene_ensembl">Ochotona princeps</option>
+      <option value="oniloticus_gene_ensembl">Oreochromis niloticus</option>
+      <option value="oanatinus_gene_ensembl">Ornithorhynchus anatinus</option> 
+      <option value="ocuniculus_gene_ensembl">Oryctolagus cuniculus</option>
+      <option value="olatipes_gene_ensembl">Oryzias latipes</option>
+      <option value="ogarnettii_gene_ensembl">Otolemur garnettii</option>
+      <option value="ptroglodytes_gene_ensembl">Pan troglodytes</option>
+      <option value="psinensis_gene_ensembl">Pelodiscus sinensis</option>
+      <option value="pmarinus_gene_ensembl">Petromyzon marinus</option>
+      <option value="pabelii_gene_ensembl">Pongo abelii</option>
+      <option value="pcapensis_gene_ensembl">Procavia capensis</option>
+      <option value="pvampyrus_gene_ensembl">Pteropus vampyrus</option>
+      <option value="scerevisiae_gene_ensembl">Saccharomyces cerevisiae</option>
+      <option value="sharrisii_gene_ensembl">Sarcophilus harrisii</option>
+      <option value="saraneus_gene_ensembl">Sorex araneus</option>
+      <option value="sscrofa_gene_ensembl">Sus scrofa</option>
+      <option value="tguttata_gene_ensembl">Taeniopygia guttata</option>
+      <option value="trubripes_gene_ensembl">Takifugu rubripes</option>
+      <option value="tsyrichta_gene_ensembl">Tarsius syrichta</option>
+      <option value="tnigroviridis_gene_ensembl">Tetraodon nigroviridis</option>
+      <option value="tbelangeri_gene_ensembl">Tupaia belangeri</option>
+      <option value="ttruncatus_gene_ensembl">Tursiops truncatus</option>
+      <option value="vpacos_gene_ensembl">Vicugna pacos</option>
+      <option value="xtropicalis_gene_ensembl">Xenopus tropicalis</option>
+    </param>
+    <param name="effects_filter" type="select" optional="true" multiple="true" label="Filter SnpEffect" help="Report coding changes for selected effects">
+      <!-- http://snpeff.sourceforge.net/faq.html#What_effects_are_predicted? -->
+      <option value="FRAME_SHIFT">FRAME_SHIFT</option>
+      <option value="NON_SYNONYMOUS_CODING">NON_SYNONYMOUS_CODING</option>
+      <option value="SPLICE_SITE_ACCEPTOR">SPLICE_SITE_ACCEPTOR</option>
+      <option value="SPLICE_SITE_DONOR">SPLICE_SITE_DONOR</option>
+      <option value="START_LOST">START_LOST</option>
+      <option value="EXON_DELETED">EXON_DELETED</option>
+      <option value="STOP_GAINED">STOP_GAINED</option>
+      <option value="STOP_LOST">STOP_LOST</option>
+      <option value="RARE_AMINO_ACID">RARE_AMINO_ACID</option>
+      <option value="CODON_CHANGE">CODON_CHANGE</option>
+      <option value="CODON_INSERTION">CODON_INSERTION</option>
+      <option value="CODON_CHANGE_PLUS_CODON_INSERTION">CODON_CHANGE_PLUS_CODON_INSERTION</option>
+      <option value="CODON_DELETION">CODON_DELETION</option>
+      <option value="CODON_CHANGE_PLUS_CODON_DELETION">CODON_CHANGE_PLUS_CODON_DELETION</option>
+      <option value="UTR_5_DELETED">UTR_5_DELETED</option>
+      <option value="UTR_3_DELETED">UTR_3_DELETED</option>
+      <option value="SYNONYMOUS_START">SYNONYMOUS_START</option>
+      <option value="NON_SYNONYMOUS_START">NON_SYNONYMOUS_START</option>
+      <option value="START_GAINED">START_GAINED</option>
+      <option value="SYNONYMOUS_CODING">SYNONYMOUS_CODING</option>
+      <option value="SYNONYMOUS_STOP">SYNONYMOUS_STOP</option>
+      <option value="UTR_5_PRIME">UTR_5_PRIME</option>
+      <option value="UTR_3_PRIME">UTR_3_PRIME</option>
+      <option value="REGULATION">REGULATION</option>
+      <option value="UPSTREAM">UPSTREAM</option>
+      <option value="DOWNSTREAM">DOWNSTREAM</option>
+      <option value="GENE">GENE</option>
+      <option value="TRANSCRIPT">TRANSCRIPT</option>
+      <option value="EXON">EXON</option>
+      <option value="INTRON_CONSERVED">INTRON_CONSERVED</option>
+      <option value="INTRON">INTRON</option>
+      <option value="INTRAGENIC">INTRAGENIC</option>
+      <option value="INTERGENIC">INTERGENIC</option>
+      <option value="INTERGENIC_CONSERVED">INTERGENIC_CONSERVED</option>
+      <option value="NONE">NONE</option>
+      <option value="CHROMOSOME">CHROMOSOME</option>
+      <option value="CUSTOM">CUSTOM</option>
+      <option value="CDS">CDS</option>
+    </param>
+    <param name="all_effects" type="boolean" truevalue="--all_effects" falsevalue="" checked="false" label="Report the variant coding for each Ensembl Transcript at the variant position" help="Default is to report only the first Transcript that has a cDNA change"/>
+    <param name="with_ccds" type="boolean" truevalue="--with_ccds" falsevalue="" checked="false" label="Report the variant coding only for Ensembl Transcripts with a CCDS ID" help=""/>
+    <param name="polya" type="integer" value="5" optional="true" label="Ignore variants that are part of a Poly-A of at least this length" help="Leave blank to turn off poly A filtering">
+      <validator type="in_range" message="poly A between 2 and 100 bases" min="2" max="100"/>
+    </param>
+    <param name="report_format" type="select" force_select="true" multiple="true" display="checkboxes" label="Report formats" help="">
+      <option value="html" selected="true">Detailed HTML report</option>
+      <option value="tsv" selected="true">TAB-delimited report</option>
+      <option value="detailed">Detailed Text report</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="html" name="html_report" label="Variant Detection Report (html) on ${on_string}">
+      <filter>'html' in report_format</filter>
+    </data>
+    <data format="tabular" name="tsv_report" label="Variant Detection Report (tsv) on ${on_string}">
+      <filter>'tsv' in report_format</filter>
+    </data>
+    <data format="text" name="text_report" label="Variant Detection Report (text) on ${on_string}">
+      <filter>'detailed' in report_format</filter>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <param name="snp_effect_vcf" ftype="vcf" value="snpeff.vcf"/>
+      <param name="ensembl_host" value="feb2012"/>
+      <param name="ensembl_dataset" value="hsapiens_gene_ensembl"/>
+      <param name="effects_filter" value="FRAME_SHIFT"/>
+      <param name="effects_filter" value="FRAME_SHIFT"/>
+      <param name="all_effects" value="false"/>
+      <param name="with_ccds" value="true"/>
+      <param name="polya" value="5"/>
+      <param name="report_format" value="tsv"/>
+      <output name="tsv_report">
+        <assert_contents>
+            <has_text text="46871931" />
+            <has_text text="109461327" />
+            <not_has_text text="54291558" />
+        </assert_contents>
+      </output>
+    </test>
+  </tests>
+ <help>
+**SnpEff Variant Detection Ensembl Report**
+
+Generates the variant sequence and its translation for the variations reported by SnpEffect_. 
+The SnpEffect_ output must be in the VCF format, and must include Ensembl Transcript IDs. SnpSift_ can be used to filter the SnpEffect_ VCF output.
+
+The sequences are retrieved from a biomart server using the xml query interface with the Ensembl Transcript ID as the query key.   
+
+.. _SnpEffect: http://snpeff.sourceforge.net/
+.. _SnpSift: http://snpeff.sourceforge.net/SnpSift.html
+
+ </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/snpeff.vcf	Tue Mar 12 13:59:14 2013 -0400
@@ -0,0 +1,7 @@
+##SnpEffVersion="SnpEff 3.1k (build 2012-12-17), by Pablo Cingolani"
+##SnpEffCmd="SnpEff -i vcf -o vcf -upDownStreamLen 0 -no downstream,intergenic,intron,upstream,utr -stats /galaxy/PRODUCTION/database/files/000/246/dataset_246290.dat GRCh37.66 /galaxy/PRODUCTION/database/files/000/246/dataset_246288.dat "
+##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
+chr1	46871931	.	GC	G	.	PASS	SAF=0.333333;DP=6;EFF=EXON(MODIFIER|||||FAAH|processed_transcript|CODING|ENST00000489366|2),EXON(MODIFIER|||||FAAH|processed_transcript|CODING|ENST00000493735|6),FRAME_SHIFT(HIGH||-|-281|579|FAAH|protein_coding|CODING|ENST00000243167|7)
+chr1	54291558	.	G	GA	.	PASS	SAF=0.333333;DP=12;EFF=EXON(MODIFIER|||||TMEM48|processed_transcript|CODING|ENST00000480952|4),FRAME_SHIFT(HIGH||-/T|-169?|542|TMEM48|protein_coding|CODING|ENST00000540001|5),FRAME_SHIFT(HIGH||-/T|-169?|557|TMEM48|protein_coding|CODING|ENST00000360494|5),FRAME_SHIFT(HIGH||-/T|-169?|674|TMEM48|protein_coding|CODING|ENST00000371429|5),FRAME_SHIFT(HIGH||-/T|-54?|559|TMEM48|protein_coding|CODING|ENST00000234725|4)
+chr1	109461327	.	G	A,GA	.	PASS	SAF=0.333333,SAF=0.333333;DP=6;EFF=FRAME_SHIFT(HIGH||-/A|-453?|684|GPSM2|protein_coding|CODING|ENST00000264126|12),FRAME_SHIFT(HIGH||-/A|-453?|684|GPSM2|protein_coding|CODING|ENST00000406462|13),SYNONYMOUS_CODING(LOW|SILENT|ggG/ggA|G452|684|GPSM2|protein_coding|CODING|ENST00000264126|),SYNONYMOUS_CODING(LOW|SILENT|ggG/ggA|G452|684|GPSM2|protein_coding|CODING|ENST00000406462|)
+