view pileup_to_vcf.py @ 4:fafa105e5f58

Fix base quality
author Jim Johnson <jj@umn.edu>
date Tue, 19 Mar 2013 13:04:40 -0500
parents aa76c8dd97e6
children d90f56ecd2f9
line wrap: on
line source

#!/usr/bin/env python
"""
#
#------------------------------------------------------------------------------
#                         University of Minnesota
#         Copyright 2012, Regents of the University of Minnesota
#------------------------------------------------------------------------------
# Author:
#
#  James E Johnson
#  Jesse Erdmann
#
#------------------------------------------------------------------------------
"""

"""
Generate a VCF file from a samtools pileup
filtering on read coverage, base call quality, and allele frequency

Pileup Format
http://samtools.sourceforge.net/pileup.shtml
Columns:
chromosome, 1-based coordinate, reference base, the number of reads covering the site, read bases, base qualities

VCF Format
http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
The header line names the 8 fixed, mandatory columns. These columns are as follows:
CHROM POS ID REF ALT QUAL FILTER INFO
"""

import sys,re,os.path
import optparse
from optparse import OptionParser

vcf_header =  """\
##fileformat=VCFv4.0
##source=pileup_to_vcf.pyV1.0
##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">
##INFO=<ID=$SAF_header,Number=.,Type=Float,Description=\"Specific Allele Frequency\">
##FILTER=<ID=DP,Description=\"Minimum depth of %s\">
##FILTER=<ID=$SAF_header,Description=\"Allele frequency of at least %s with base quality minimum %d\">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO\
"""

def __main__():
  #Parse Command Line
  parser = optparse.OptionParser()
  # files
  parser.add_option( '-i', '--input', dest='input', help='The input pileup file (else read from stdin)' )
  parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' )
  # filters
  parser.add_option( '-c', '--min_coverage', type='int', default='1', dest='min_coverage', help='The minimum read coverage depth to report a variant [default 1]' )
  parser.add_option( '-r', '--report_depth', type='choice', choices=['source','ref','qual','all'], default='ref', dest='report_depth', help='Coverage depth as: source - as reported in the ipleup source, ref - reads rcovering base, qual - reads with base call above min_base_quality, all - reads spanning base (includes indels)' )
  parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' )
  parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' )
  parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' )
  parser.add_option( '-s', '--snps_only', action="store_true", dest='snps_only', default=False, help='Only report SNPs, not indels' )
  # select columns
  parser.add_option( '-C', '--chrom_col', type='int', default='1', dest='chrom_col', help='The ordinal position (starting with 1) of the chromosome column' )
  parser.add_option( '-P', '--pos_col', type='int', default='2', dest='pos_col', help='The ordinal position (starting with 1) of the position column' )
  parser.add_option( '-R', '--ref_col', type='int', default='3', dest='ref_col', help='The ordinal position (starting with 1) of the reference base  column' )
  parser.add_option( '-D', '--coverage_col', type='int', default='4', dest='coverage_col', help='The ordinal position (starting with 1) of the read coverage depth column' )
  parser.add_option( '-B', '--base_call_col', type='int', default='5', dest='base_call_col', help='The ordinal position (starting with 1) of the base call column' )
  parser.add_option( '-Q', '--base_qual_col', type='int', default='6', dest='base_qual_col', help='The ordinal position (starting with 1) of the base quality column' )
  # debug
  parser.add_option( '-d', '--debug', action="store_true", dest='debug', default=False, help='Print Debug messages' )
  # 
  (options, args) = parser.parse_args()

  debug = options.debug if options.debug else False

  #Coverage must have at least a total depth of min_coverage to be considered
  min_coverage = options.min_coverage
  #Count depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality
  dp_as = options.report_depth
  #Base qualities must be at least the min_base_quality value to be counted
  min_base_quality = options.min_base_quality
  #Coverage must be more than this pct in support of the variant, e.g. variant support / total coverage > min_allele_freq
  min_allele_freq = options.min_allele_freq
  #Allow multiple variants to be reported for a position, min_allele_freq must be less than 0.5 for this to have effect.
  allow_multiples = options.allow_multiples
  # Whether to report indels 
  report_indels = not options.snps_only

  # Check for 1-based column options, and subtract 1 for python array index
  #Which column (count starting with 1) contains the chromosome (Default assumes 6-col pileup or 12-col filtered pileup)
  chrom_col = options.chrom_col - 1
  #Which column (count starting with 1) contains the position (Default assumes 6-col pileup or 12-col filtered pileup)
  pos_col = options.pos_col - 1
  #Which column (count starting with 1) contains the reference base (Default assumes 6-col pileup or 12-col filtered pileup)
  ref_col = options.ref_col - 1
  #Which column (count starting with 1) contains the total coverage for the position (Default assumes 6-col pileup or 12-col filtered pileup)
  coverage_col = options.coverage_col - 1
  #Which column (count starting with 1) contains the base calls for the position (Default assumes 6-col pileup or 12-col filtered pileup)
  base_call_col = options.base_call_col - 1
  #Which column (count starting with 1) contains the base call qualities for the position (Default assumes 6-col pileup or 12-col filtered pileup)
  base_qual_col = options.base_qual_col - 1

  # pileup input 
  if options.input != None:
    try:
      inputPath = os.path.abspath(options.input)
      inputFile = open(inputPath, 'r')
    except Exception, e:
      print >> sys.stderr, "failed: %s" % e
      exit(2)
  else:
    inputFile = sys.stdin
  # vcf output 
  if options.output != None:
    try:
      outputPath = os.path.abspath(options.output)
      outputFile = open(outputPath, 'w')
    except Exception, e:
      print >> sys.stderr, "failed: %s" % e
      exit(3)
  else:
    outputFile = sys.stdout

  indel_len_pattern = '([1-9][0-9]*)'
  ref_skip_pattern = '[<>]'

  SAF_header = "SAF";

  print >> outputFile, vcf_header % (min_coverage,min_allele_freq,min_base_quality)

  try:  
    for linenum,line in enumerate(inputFile):
      if debug: print >> sys.stderr, "%5d:\t%s" % (linenum,line)
      fields = line.split('\t')
      sdp = int(fields[coverage_col]) # count of all reads spanning ref base from source coverage column
      # Quick check for coverage
      if dp_as != 'all' and sdp < min_coverage :  
        continue
      bases = fields[base_call_col]
      m = re.findall(ref_skip_pattern,bases) 
      # Quick check for coverage, pileup coverage - reference skips
      rdp = sdp - (len(m) if m else 0)
      if dp_as == 'ref'  or dp_as == 'qual':
        if rdp < min_coverage:
          continue
      quals = fields[base_qual_col]
      chrom = fields[chrom_col]
      pos = int(fields[pos_col])
      ref = fields[ref_col]
      deletions = dict()
      max_deletion = 0
      insertions = dict()
      snps = {'A':0,'C':0,'G':0,'T':0,'N':0}
      bi = 0	# bases index
      qi = 0	# quals index
      adp = 0	# read coverage depth
      qdp = 0	# read coverage depth with base call quality passing minimum
      variants = dict() # variant -> count
      mc = 0 # match count
      while bi < len(bases):
        base = bases[bi] 
        # if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi,base,qi,quals[qi],ord(quals[qi]) - 33)
        bi += 1
        if base in '<>' :	#reference skip (between paired reads)
          if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33)
          qi += 1
        elif base in '.,' : # match reference on forward/reverse strand
          if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33)
          mc += 1
          adp += 1
          qual = ord(quals[qi]) -33
          qi += 1
          # count match 
          if qual >= min_base_quality:
            qdp += 1
        elif base in 'ACGTNacgtn' : # SNP on forward/reverse strand
          if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33)
          adp += 1
          qual = ord(quals[qi]) -33
          qi += 1
          # record snp variant
          if qual >= min_base_quality:
            qdp += 1
            snps[base.upper()] += 1
        elif base in '+-' : # indel 
          adp += 1
          m = re.match(indel_len_pattern,bases[bi:]) 
          indel_len_str = m.group(0) # get the indel len string
          bi += len(indel_len_str) # increment the index by the len of the len string
          indel_len = int(indel_len_str)
          indel = bases[bi:bi+indel_len].upper() # get the indel bases
          if debug: print >> sys.stderr, "%3d\t%s%s" % (bi-2,base,indel)
          bi += indel_len # increment the index by the len of the indel 
          if base == '+':
            # record insertion variant
            if indel in insertions: 
              insertions[indel] += 1
            else:
              insertions[indel] = 1
          elif base == '-': 
            # record deletion variant
            max_deletion = max(indel_len,max_deletion)
            if indel in deletions: 
              deletions[indel] += 1
            else:
              deletions[indel] = 1
        elif base == '*' : # a deletion from a prior base
          if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi-1,base,qi,quals[qi],ord(quals[qi]) - 33)
          qi += 1
        elif base == '^' : #start of read (followed by read quality char)
          if debug: print >> sys.stderr, "%3d\t%s%s" % (bi-1,base,bases[bi])
          read_mapping_qual = ord(bases[bi]) - 33 
          bi += 1
        elif base == '$' : #end of read
          if debug: print >> sys.stderr, "%3d\t%s" % (bi-1,base)
          pass # no associated quality in either bases or quals
      dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else sdp if dp_as == 'source' else adp
      if debug: print >> sys.stderr, "match count: %d" % mc
      if debug: print >> sys.stderr, "snps: %s\tins: %s\tdel: %s" % (snps,insertions,deletions)
      if dp < max(1,min_coverage) :
        continue
      # output any variants that meet the depth coverage requirement
      vcf_pos = pos
      vcf_ref = ref
      suffix = ''
      alts = []
      safs = []
      # If we have any deletions to report, need to modify the ref string
      if debug: print >> sys.stderr, "max deletions: %s" % deletions
      if report_indels:
        for k,v in deletions.items():
          saf = v/float(dp)
          if saf >= min_allele_freq:
            if len(k) > len(suffix):
              suffix = k
        vcf_ref = (ref + suffix).upper()
      if debug: print >> sys.stderr, "snps: %s" % snps
      for k,v in snps.items():
        if debug: print >> sys.stderr, "snp: %s %d" % (k,v)
        saf = v/float(dp)
        if saf >= min_allele_freq:
          alts.append(k + suffix)
          safs.append(saf)
      if debug: print >> sys.stderr, "insertions: %s" % insertions
      if report_indels:
        for k,v in insertions.items():
          saf = v/float(dp)
          if saf >= min_allele_freq:
            alts.append(ref + k + suffix)
            safs.append(saf)
        if debug: print >> sys.stderr, "deletions: %s" % deletions
        for k,v in deletions.items():
          saf = v/float(dp)
          if saf >= min_allele_freq:
            alts.append(vcf_ref[:len(vcf_ref) - len(k)])   # TODO alt will be a substring of vcf_ref,  test this
            safs.append(saf)
      if len(alts) > 0:
        vcf_id = "."
        vcf_qual = "." 
        vcf_filter = "PASS"
        # if not allow_multiples, report only the most freq alt
        if not allow_multiples and len(alts) > 1:
          max_saf = max(safs)
          for i,v in enumerate(safs):
            if v == max_saf:
              vcf_alt = alts[i]
          info = "SAF=%f;DP=%d" % (max_saf,dp)
        # if allow_multiples
        else:
          vcf_alt = ','.join(alts)
          isafs = []
          for saf in safs:
            isafs.append("SAF=%f" % saf)
          info = "%s;DP=%d" % (','.join(isafs),dp)
        print >> outputFile,"%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s" % (chrom,vcf_pos,vcf_id,vcf_ref,vcf_alt,vcf_qual,vcf_filter,info)
  except Exception, e:
    print >> sys.stderr, "failed: %s" % e
    exit(1)

if __name__ == "__main__": __main__()