# HG changeset patch # User Jim Johnson # Date 1362364723 21600 # Node ID d6de2d1f4af9e4141d17c79e6f4c9a1d0e397347 # Parent bb0ac51f1b02780d3205f02180559a5d0fde17e2 Fix read depth reporting diff -r bb0ac51f1b02 -r d6de2d1f4af9 pileup_to_vcf.py --- a/pileup_to_vcf.py Mon Feb 18 19:11:47 2013 -0600 +++ b/pileup_to_vcf.py Sun Mar 03 20:38:43 2013 -0600 @@ -50,7 +50,7 @@ parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' ) # filters parser.add_option( '-c', '--min_coverage', type='int', default='1', dest='min_coverage', help='The minimum read coverage depth to report a variant [default 1]' ) - parser.add_option( '-r', '--report_depth', type='choice', choices=['all','ref','qual'], default='ref', dest='report_depth', help='Coverage depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality' ) + parser.add_option( '-r', '--report_depth', type='choice', choices=['source','ref','qual','all'], default='ref', dest='report_depth', help='Coverage depth as: source - as reported in the ipleup source, ref - reads rcovering base, qual - reads with base call above min_base_quality, all - reads spanning base (includes indels)' ) parser.add_option( '-b', '--min_base_quality', type='int', default='0', dest='min_base_quality', help='The minimum base quality for a base call to be counted' ) parser.add_option( '-f', '--min_allele_freq', type='float', default='.5', dest='min_allele_freq', help='The minimum frequency of an allele for it to be reported (default .5)' ) parser.add_option( '-m', '--allow_multiples', action="store_true", dest='allow_multiples', default=False, help='Allow multiple alleles to be reported' ) @@ -128,15 +128,16 @@ for linenum,line in enumerate(inputFile): if debug: print >> sys.stderr, "%5d:\t%s" % (linenum,line) fields = line.split('\t') - adp = int(fields[coverage_col]) # count of all reads spanning ref base + sdp = int(fields[coverage_col]) # count of all reads spanning ref base from source coverage column # Quick check for coverage - if adp < min_coverage : + if dp_as != 'all' and sdp < min_coverage : continue bases = fields[base_call_col] - if dp_as != 'all' : - m = re.findall(ref_skip_pattern,bases) - # Quick check for coverage, pileup coverage - reference skips - if adp -len(m) < min_coverage: + m = re.findall(ref_skip_pattern,bases) + # Quick check for coverage, pileup coverage - reference skips + rdp = sdp - (len(m) if m else 0) + if dp_as == 'ref' or dp_as == 'qual': + if rdp < min_coverage: continue quals = fields[base_qual_col] chrom = fields[chrom_col] @@ -148,9 +149,10 @@ snps = {'A':0,'C':0,'G':0,'T':0,'N':0} bi = 0 # bases index qi = 0 # quals index - rdp = 0 # read coverage depth + adp = 0 # read coverage depth qdp = 0 # read coverage depth with base call quality passing minimum variants = dict() # variant -> count + mc = 0 # match count while bi < len(bases): base = bases[bi] if debug: print >> sys.stderr, "%3d\t%s\t%3d\t%s %3d" % (bi,base,qi,quals[qi],ord(quals[qi]) - 33) @@ -158,14 +160,15 @@ if base in '<>' : #reference skip (between paired reads) pass; elif base in '.,' : # match reference on forward/reverse strand - rdp += 1 + mc += 1 + adp += 1 qual = ord(quals[qi]) -33 qi += 1 # count match if qual >= min_base_quality: qdp += 1 elif base in 'ACGTNacgtn' : # SNP on forward/reverse strand - rdp += 1 + adp += 1 qual = ord(quals[qi]) -33 qi += 1 # record snp variant @@ -173,8 +176,7 @@ qdp += 1 snps[base.upper()] += 1 elif base in '+-' : # indel - rdp += 1 - qdp += 1 # no related quality score, so count it + adp += 1 m = re.match(indel_len_pattern,bases[bi:]) indel_len_str = m.group(0) # get the indel len string bi += len(indel_len_str) # increment the index by the len of the len string @@ -201,7 +203,8 @@ bi += 1 elif base == '$' : #end of read pass # no associated quality in either bases or quals - dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else adp + dp = rdp if dp_as == 'ref' else qdp if dp_as == 'qual' else sdp if dp_as == 'source' else adp + if debug: print >> sys.stderr, "match count: %d" % mc if debug: print >> sys.stderr, "snps: %s\tins: %s\tdel: %s" % (snps,insertions,deletions) if dp < max(1,min_coverage) : continue diff -r bb0ac51f1b02 -r d6de2d1f4af9 pileup_to_vcf.xml --- a/pileup_to_vcf.xml Mon Feb 18 19:11:47 2013 -0600 +++ b/pileup_to_vcf.xml Sun Mar 03 20:38:43 2013 -0600 @@ -60,9 +60,10 @@ help="Multiple alleles may be output in the VCF if the allowable frequency is below 0.5, otherwise only one will be reported"/> - + + - +