diff allele-counts.py @ 6:df3b28364cd2

allele-counts.{py,xml}: Add strand bias, documentation updates.
author nicksto <nmapsy@gmail.com>
date Wed, 09 Dec 2015 11:20:51 -0500
parents 31361191d2d2
children 411adeff1eec
line wrap: on
line diff
--- a/allele-counts.py	Thu Sep 12 11:34:23 2013 -0400
+++ b/allele-counts.py	Wed Dec 09 11:20:51 2015 -0500
@@ -1,31 +1,53 @@
 #!/usr/bin/python
-# This parses the output of Dan's "Naive Variant Detector" (previously,
-# "BAM Coverage"). It was forked from the code of "bam-coverage.py".
-# Or, to put it briefly,
-# cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
-#
-# New in this version:
-#   Made header line customizable
-#     - separate from internal column labels, which are used as dict keys
+"""
+Run with -h option or see DESCRIPTION for description.
+This script's functionality is being obsoleted by the new, and much more sanely
+written, nvc-filter.py.
+
+New in this version:
+  - Calculate strand bias
+  - Rename MINOR.FREQ.PERC to MAF
+
+Naive Variant Caller variant count parsing one-liner:
+$ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
+"""
+from __future__ import division
 import os
 import sys
+import errno
 import random
 from optparse import OptionParser
 
-COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
-COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
+COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles',
+  'major', 'minor', 'freq', 'bias']
+COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES',
+  'MAJOR', 'MINOR', 'MAF', 'BIAS']
 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
-USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv
-       cat variants.vcf | %prog [options] > alleles.csv"""
+USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.tsv
+       cat variants.vcf | %prog [options] > alleles.tsv"""
 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
   'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False,
   'debug_loc':'', 'seed':''}
-DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
+DESCRIPTION = """This will parse the VCF output of the "Naive Variant Caller"
+(aka "BAM Coverage") Galaxy tool. For each position reported, it counts the
+number of reads of each base, determines the major allele, minor allele (second
+most frequent variant), and number of alleles above a threshold. So currently
+it only considers SNVs (ACGT), including in the coverage figure. By default it
+reads from stdin and prints to stdout.
+Prints a tab-delimited set of statistics to stdout.
+To print output column labels, run "$ echo -n | ./allele-counts.py -H".
+The columns are: 1:SAMPLE 2:CHR 3:POS 4:A 5:C 6:G 7:T 8:CVRG 9:ALLELES 10:MAJOR
+11:MINOR 12:MAF 13:BIAS,
+unless the --stranded option is used, in which case they are:
+1:SAMPLE 2:CHR 3:POS 4:+A 5:+C 6:+G 7:+T 8:-A 9:-C 10:-G 11:-T 12:CVRG
+13:ALLELES 14:MAJOR 15:MINOR 16:MAF 17:BIAS.
+"""
 EPILOG = """Requirements:
 The input VCF must report the variants for each strand.
 The variants should be case-sensitive (e.g. all capital base letters).
-Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
-
+Strand bias: Both strands must show the same bases passing the frequency
+threshold (but not necessarily in the same order). If the site fails this test,
+the number of alleles is reported as 0."""
 
 def get_options(defaults, usage, description='', epilog=''):
   """Get options, print usage text."""
@@ -78,7 +100,7 @@
   infile = options.infile
   outfile = options.outfile
   print_header = options.print_header
-  freq_thres = options.freq_thres / 100.0
+  freq_thres = options.freq_thres / 100
   covg_thres = options.covg_thres
   stranded = options.stranded
   debug_loc = options.debug_loc
@@ -102,6 +124,7 @@
     if len(coords) > 2: print_sample = coords[2]
 
   # set infile_handle to either stdin or the input file
+  global infile_handle
   if infile == OPT_DEFAULTS.get('infile'):
     infile_handle = sys.stdin
     sys.stderr.write("Reading from standard input..\n")
@@ -112,12 +135,13 @@
       fail('Error: Input VCF file '+infile+' not found.')
 
   # set outfile_handle to either stdout or the output file
+  global outfile_handle
   if outfile == OPT_DEFAULTS.get('outfile'):
     outfile_handle = sys.stdout
   else:
     try:
       outfile_handle = open(outfile, 'w')
-    except IOError, e:
+    except IOError:
       fail('Error: The given output filename '+outfile+' could not be opened.')
 
   # Take care of column names, print header
@@ -169,13 +193,15 @@
     if debug and site_summary[0]['print']:
         print line.split('\t')[9].split(':')[-1]
 
-    print_site(outfile_handle, site_summary, COLUMNS)
+    try:
+      print_site(outfile_handle, site_summary, COLUMNS)
+    except IOError as ioe:
+      if ioe.errno == errno.EPIPE:
+        cleanup()
+        sys.exit(0)
 
   # close any open filehandles
-  if infile_handle is not sys.stdin:
-    infile_handle.close()
-  if outfile_handle is not sys.stdout:
-    outfile_handle.close()
+  cleanup()
 
   # keeps Galaxy from giving an error if there were messages on stderr
   sys.exit(0)
@@ -247,7 +273,7 @@
           +"Failed on line:\n"+line)
       try:
         variant_counts[variant] = int(float(reads))
-      except ValueError, e:
+      except ValueError:
         fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
 
     sample_counts[sample_names[i]] = variant_counts
@@ -330,15 +356,22 @@
     sample['coverage'] = coverage
     try:
       sample['major']  = ranked_bases[0][0]
-    except IndexError, e:
+    except IndexError:
       sample['major']  = '.'
     try:
       sample['minor']  = ranked_bases[1][0]
-      sample['freq']   = round(ranked_bases[1][1]/float(coverage), 5)
-    except IndexError, e:
+      sample['freq']   = round(ranked_bases[1][1]/coverage, 5)
+    except IndexError:
       sample['minor']  = '.'
       sample['freq']   = 0.0
 
+    # Now that we've decided major and minor, we can calculate strand bias
+    bias = get_strand_bias(sample, variants)
+    if bias is None:
+      sample['bias'] = '.'
+    else:
+      sample['bias'] = round(bias, 5)
+
     site_summary.append(sample)
 
   return site_summary
@@ -393,12 +426,12 @@
     print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
     for variant in variant_counts:
       print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
-        str(variant[1]/float(coverage)))
+        str(variant[1]/coverage))
 
   # remove bases below the frequency threshold
   if freq_thres > 0:
     variant_counts = [variant for variant in variant_counts
-      if variant[1]/float(coverage) >= freq_thres]
+      if variant[1]/coverage >= freq_thres]
 
   return variant_counts
 
@@ -432,6 +465,23 @@
   return allele_count
 
 
+def get_strand_bias(sample, variants):
+  """Based on method 1 (SB) of Guo et al., 2012
+  If there a denominator would be 0, there is no valid result and this will
+  return None. This occurs when there are no reads on one of the strands, or
+  when there are no minor allele reads."""
+  # using same variable names as in paper
+  a = variants.get('+'+sample['major'], 0) # forward major allele
+  b = variants.get('+'+sample['minor'], 0) # forward minor allele
+  c = variants.get('-'+sample['major'], 0) # reverse major allele
+  d = variants.get('-'+sample['minor'], 0) # reverse minor allele
+  # no minor allele reads
+  try:
+    return abs(b/(a+b) - d/(c+d)) / ((b+d) / (a+b+c+d))
+  except ZeroDivisionError:
+    return None
+
+
 def print_site(filehandle, site, columns):
   """Print the output lines for one site (one per sample).
   filehandle must be open."""
@@ -442,9 +492,17 @@
 
 
 def fail(message):
+  cleanup()
   sys.stderr.write(message+'\n')
   sys.exit(1)
 
 
+def cleanup():
+  if isinstance(infile_handle, file):
+    infile_handle.close()
+  if isinstance(outfile_handle, file):
+    outfile_handle.close()
+
+
 if __name__ == "__main__":
   main()
\ No newline at end of file