diff pileup_to_vcf.py @ 2:d6de2d1f4af9

Fix read depth reporting
author Jim Johnson <jj@umn.edu>
date Sun, 03 Mar 2013 20:38:43 -0600
parents bb0ac51f1b02
children aa76c8dd97e6
line wrap: on
line diff
--- a/pileup_to_vcf.py	Mon Feb 18 19:11:47 2013 -0600
+++ b/pileup_to_vcf.py	Sun Mar 03 20:38:43 2013 -0600
@@ -50,7 +50,7 @@
   parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' )
   # filters
   parser.add_option( '-c', '--min_coverage', type='int', default='1', dest='min_coverage', help='The minimum read coverage depth to report a variant [default 1]' )
-  parser.add_option( '-r', '--report_depth', type='choice', choices=['all','ref','qual'], default='ref', dest='report_depth', help='Coverage depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality' )
+  parser.add_option( '-r', '--report_depth', type='choice', choices=['source','ref','qual','all'], default='ref', dest='report_depth', help='Coverage depth as: source - as reported in the ipleup source, ref - reads rcovering base, qual - reads with base call above min_base_quality, all - reads spanning base (includes indels)' )
   parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' )
   parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' )
   parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' )
@@ -128,15 +128,16 @@
     for linenum,line in enumerate(inputFile):
       if debug: print >> sys.stderr, "%5d:\t%s" % (linenum,line)
       fields = line.split('\t')
-      adp = int(fields[coverage_col]) # count of all reads spanning ref base
+      sdp = int(fields[coverage_col]) # count of all reads spanning ref base from source coverage column
       # Quick check for coverage
-      if adp < min_coverage :  
+      if dp_as != 'all' and sdp < min_coverage :  
         continue
       bases = fields[base_call_col]
-      if dp_as != 'all' :
-        m = re.findall(ref_skip_pattern,bases) 
-        # Quick check for coverage, pileup coverage - reference skips
-        if adp -len(m) < min_coverage:
+      m = re.findall(ref_skip_pattern,bases) 
+      # Quick check for coverage, pileup coverage - reference skips
+      rdp = sdp - (len(m) if m else 0)
+      if dp_as == 'ref'  or dp_as == 'qual':
+        if rdp < min_coverage:
           continue
       quals = fields[base_qual_col]
       chrom = fields[chrom_col]
@@ -148,9 +149,10 @@
       snps = {'A':0,'C':0,'G':0,'T':0,'N':0}
       bi = 0	# bases index
       qi = 0	# quals index
-      rdp = 0	# read coverage depth
+      adp = 0	# read coverage depth
       qdp = 0	# read coverage depth with base call quality passing minimum
       variants = dict() # variant -> count
+      mc = 0 # match count
       while bi < len(bases):
         base = bases[bi] 
         if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi,base,qi,quals[qi],ord(quals[qi]) - 33)
@@ -158,14 +160,15 @@
         if base in '<>' :	#reference skip (between paired reads)
           pass;
         elif base in '.,' : # match reference on forward/reverse strand
-          rdp += 1
+          mc += 1
+          adp += 1
           qual = ord(quals[qi]) -33
           qi += 1
           # count match 
           if qual >= min_base_quality:
             qdp += 1
         elif base in 'ACGTNacgtn' : # SNP on forward/reverse strand
-          rdp += 1
+          adp += 1
           qual = ord(quals[qi]) -33
           qi += 1
           # record snp variant
@@ -173,8 +176,7 @@
             qdp += 1
             snps[base.upper()] += 1
         elif base in '+-' : # indel 
-          rdp += 1
-          qdp += 1 # no related quality score, so count it
+          adp += 1
           m = re.match(indel_len_pattern,bases[bi:]) 
           indel_len_str = m.group(0) # get the indel len string
           bi += len(indel_len_str) # increment the index by the len of the len string
@@ -201,7 +203,8 @@
           bi += 1
         elif base == '$' : #end of read
           pass # no associated quality in either bases or quals
-      dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else adp
+      dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else sdp if dp_as == 'source' else adp
+      if debug: print >> sys.stderr, "match count: %d" % mc
       if debug: print >> sys.stderr, "snps: %s\tins: %s\tdel: %s" % (snps,insertions,deletions)
       if dp < max(1,min_coverage) :
         continue