# HG changeset patch
# User nicksto <nmapsy@gmail.com>
# Date 1449678051 18000
# Node ID df3b28364cd2d97e6fba262dd32c3358792afb08
# Parent  31361191d2d29b37a1f823231ef91678b0567b10
allele-counts.{py,xml}: Add strand bias, documentation updates.

diff -r 31361191d2d2 -r df3b28364cd2 allele-counts.py
--- a/allele-counts.py	Thu Sep 12 11:34:23 2013 -0400
+++ b/allele-counts.py	Wed Dec 09 11:20:51 2015 -0500
@@ -1,31 +1,53 @@
 #!/usr/bin/python
-# This parses the output of Dan's "Naive Variant Detector" (previously,
-# "BAM Coverage"). It was forked from the code of "bam-coverage.py".
-# Or, to put it briefly,
-# cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
-#
-# New in this version:
-#   Made header line customizable
-#     - separate from internal column labels, which are used as dict keys
+"""
+Run with -h option or see DESCRIPTION for description.
+This script's functionality is being obsoleted by the new, and much more sanely
+written, nvc-filter.py.
+
+New in this version:
+  - Calculate strand bias
+  - Rename MINOR.FREQ.PERC to MAF
+
+Naive Variant Caller variant count parsing one-liner:
+$ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
+"""
+from __future__ import division
 import os
 import sys
+import errno
 import random
 from optparse import OptionParser
 
-COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
-COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
+COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles',
+  'major', 'minor', 'freq', 'bias']
+COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES',
+  'MAJOR', 'MINOR', 'MAF', 'BIAS']
 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
-USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv
-       cat variants.vcf | %prog [options] > alleles.csv"""
+USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.tsv
+       cat variants.vcf | %prog [options] > alleles.tsv"""
 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
   'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False,
   'debug_loc':'', 'seed':''}
-DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
+DESCRIPTION = """This will parse the VCF output of the "Naive Variant Caller"
+(aka "BAM Coverage") Galaxy tool. For each position reported, it counts the
+number of reads of each base, determines the major allele, minor allele (second
+most frequent variant), and number of alleles above a threshold. So currently
+it only considers SNVs (ACGT), including in the coverage figure. By default it
+reads from stdin and prints to stdout.
+Prints a tab-delimited set of statistics to stdout.
+To print output column labels, run "$ echo -n | ./allele-counts.py -H".
+The columns are: 1:SAMPLE 2:CHR 3:POS 4:A 5:C 6:G 7:T 8:CVRG 9:ALLELES 10:MAJOR
+11:MINOR 12:MAF 13:BIAS,
+unless the --stranded option is used, in which case they are:
+1:SAMPLE 2:CHR 3:POS 4:+A 5:+C 6:+G 7:+T 8:-A 9:-C 10:-G 11:-T 12:CVRG
+13:ALLELES 14:MAJOR 15:MINOR 16:MAF 17:BIAS.
+"""
 EPILOG = """Requirements:
 The input VCF must report the variants for each strand.
 The variants should be case-sensitive (e.g. all capital base letters).
-Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
-
+Strand bias: Both strands must show the same bases passing the frequency
+threshold (but not necessarily in the same order). If the site fails this test,
+the number of alleles is reported as 0."""
 
 def get_options(defaults, usage, description='', epilog=''):
   """Get options, print usage text."""
@@ -78,7 +100,7 @@
   infile = options.infile
   outfile = options.outfile
   print_header = options.print_header
-  freq_thres = options.freq_thres / 100.0
+  freq_thres = options.freq_thres / 100
   covg_thres = options.covg_thres
   stranded = options.stranded
   debug_loc = options.debug_loc
@@ -102,6 +124,7 @@
     if len(coords) > 2: print_sample = coords[2]
 
   # set infile_handle to either stdin or the input file
+  global infile_handle
   if infile == OPT_DEFAULTS.get('infile'):
     infile_handle = sys.stdin
     sys.stderr.write("Reading from standard input..\n")
@@ -112,12 +135,13 @@
       fail('Error: Input VCF file '+infile+' not found.')
 
   # set outfile_handle to either stdout or the output file
+  global outfile_handle
   if outfile == OPT_DEFAULTS.get('outfile'):
     outfile_handle = sys.stdout
   else:
     try:
       outfile_handle = open(outfile, 'w')
-    except IOError, e:
+    except IOError:
       fail('Error: The given output filename '+outfile+' could not be opened.')
 
   # Take care of column names, print header
@@ -169,13 +193,15 @@
     if debug and site_summary[0]['print']:
         print line.split('\t')[9].split(':')[-1]
 
-    print_site(outfile_handle, site_summary, COLUMNS)
+    try:
+      print_site(outfile_handle, site_summary, COLUMNS)
+    except IOError as ioe:
+      if ioe.errno == errno.EPIPE:
+        cleanup()
+        sys.exit(0)
 
   # close any open filehandles
-  if infile_handle is not sys.stdin:
-    infile_handle.close()
-  if outfile_handle is not sys.stdout:
-    outfile_handle.close()
+  cleanup()
 
   # keeps Galaxy from giving an error if there were messages on stderr
   sys.exit(0)
@@ -247,7 +273,7 @@
           +"Failed on line:\n"+line)
       try:
         variant_counts[variant] = int(float(reads))
-      except ValueError, e:
+      except ValueError:
         fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
 
     sample_counts[sample_names[i]] = variant_counts
@@ -330,15 +356,22 @@
     sample['coverage'] = coverage
     try:
       sample['major']  = ranked_bases[0][0]
-    except IndexError, e:
+    except IndexError:
       sample['major']  = '.'
     try:
       sample['minor']  = ranked_bases[1][0]
-      sample['freq']   = round(ranked_bases[1][1]/float(coverage), 5)
-    except IndexError, e:
+      sample['freq']   = round(ranked_bases[1][1]/coverage, 5)
+    except IndexError:
       sample['minor']  = '.'
       sample['freq']   = 0.0
 
+    # Now that we've decided major and minor, we can calculate strand bias
+    bias = get_strand_bias(sample, variants)
+    if bias is None:
+      sample['bias'] = '.'
+    else:
+      sample['bias'] = round(bias, 5)
+
     site_summary.append(sample)
 
   return site_summary
@@ -393,12 +426,12 @@
     print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
     for variant in variant_counts:
       print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
-        str(variant[1]/float(coverage)))
+        str(variant[1]/coverage))
 
   # remove bases below the frequency threshold
   if freq_thres > 0:
     variant_counts = [variant for variant in variant_counts
-      if variant[1]/float(coverage) >= freq_thres]
+      if variant[1]/coverage >= freq_thres]
 
   return variant_counts
 
@@ -432,6 +465,23 @@
   return allele_count
 
 
+def get_strand_bias(sample, variants):
+  """Based on method 1 (SB) of Guo et al., 2012
+  If there a denominator would be 0, there is no valid result and this will
+  return None. This occurs when there are no reads on one of the strands, or
+  when there are no minor allele reads."""
+  # using same variable names as in paper
+  a = variants.get('+'+sample['major'], 0) # forward major allele
+  b = variants.get('+'+sample['minor'], 0) # forward minor allele
+  c = variants.get('-'+sample['major'], 0) # reverse major allele
+  d = variants.get('-'+sample['minor'], 0) # reverse minor allele
+  # no minor allele reads
+  try:
+    return abs(b/(a+b) - d/(c+d)) / ((b+d) / (a+b+c+d))
+  except ZeroDivisionError:
+    return None
+
+
 def print_site(filehandle, site, columns):
   """Print the output lines for one site (one per sample).
   filehandle must be open."""
@@ -442,9 +492,17 @@
 
 
 def fail(message):
+  cleanup()
   sys.stderr.write(message+'\n')
   sys.exit(1)
 
 
+def cleanup():
+  if isinstance(infile_handle, file):
+    infile_handle.close()
+  if isinstance(outfile_handle, file):
+    outfile_handle.close()
+
+
 if __name__ == "__main__":
   main()
\ No newline at end of file
diff -r 31361191d2d2 -r df3b28364cd2 allele-counts.xml
--- a/allele-counts.xml	Thu Sep 12 11:34:23 2013 -0400
+++ b/allele-counts.xml	Wed Dec 09 11:20:51 2015 -0500
@@ -1,13 +1,14 @@
-<tool id="allele_counts_1" version="1.1" name="Variant Annotator">
+<tool id="allele_counts_1" version="1.2" name="Variant Annotator">
   <description> process variant counts</description>
-  <command interpreter="python">allele-counts.py -i $input -o $output -f $freq -c $covg $header $stranded $nofilt</command>
+  <command interpreter="python">allele-counts.py -i $input -o $output -f $freq -c $covg $header $stranded $nofilt -r $seed</command>
   <inputs>
     <param name="input" type="data" format="vcf" label="Input variants from Naive Variants Detector"/>
-    <param name="freq" type="float" value="1.0" min="0" max="100" label="Minor allele frequency threshold (in percent)"/>
-    <param name="covg" type="integer" value="10" min="0" label="Coverage threshold (in reads per strand)"/>
+    <param name="freq" type="float" value="1.0" min="0" max="100" label="Minor allele frequency threshold" help="in percent"/>
+    <param name="covg" type="integer" value="10" min="0" label="Coverage threshold" help="in reads (per strand)"/>
     <param name="nofilt" type="boolean" truevalue="-n" falsevalue="" checked="False" label="Do not filter sites or alleles" />
     <param name="stranded" type="boolean" truevalue="-s" falsevalue="" checked="False" label="Output stranded base counts" />
     <param name="header" type="boolean" truevalue="-H" falsevalue="" checked="True" label="Write header line" />
+    <param name="seed" type="text" value="" label="PRNG seed" />
   </inputs>
   <outputs>
     <data name="output" format="tabular"/>
@@ -17,6 +18,16 @@
     <exit_code range=":-1" err_level="fatal"/>
   </stdio>
 
+  <tests>
+    <test>
+      <param name="input" value="tests/artificial.vcf.in" />
+      <param name="freq" value="10" />
+      <param name="covg" value="10" />
+      <param name="seed" value="1" />
+      <output name="output" file="tests/artificial.csv.out" />
+    </test>
+  </tests>
+
   <help>
 
 .. class:: infomark
@@ -45,7 +56,7 @@
 
 **Output**
 
-Each row represents one site in one sample. For unstranded output, 12 fields give information about that site::
+Each row represents one site in one sample. For **unstranded** output, 13 fields give information about that site::
 
     1.  SAMPLE  - Sample name (from VCF sample column labels)
     2.  CHR     - Chromosome of the site
@@ -58,23 +69,24 @@
     9.  ALLELES - Number of qualifying alleles
     10. MAJOR   - Major allele
     11. MINOR   - Minor allele (2nd most prevalent variant)
-    12. MINOR.FREQ.PERC. - Frequency of minor allele
+    12. MAF     - Frequency of minor allele
+    13. BIAS    - Strand bias measure
 
 For stranded output, instead of using 4 columns to report read counts per base, 8 are used to report the stranded counts per base::
 
-    1       2   3   4  5  6  7  8  9 10 11  12    13     14    15         16
-    SAMPLE CHR POS +A +C +G +T -A -C -G -T CVRG ALLELES MAJOR MINOR MINOR.FREQ.PERC.
+    1       2   3   4  5  6  7  8  9 10 11  12    13     14    15   16   17
+    SAMPLE CHR POS +A +C +G +T -A -C -G -T CVRG ALLELES MAJOR MINOR MAF BIAS
 
 **Example**
 
 Below is a header line, followed by some example data lines. Since the input contained three samples, the data for each site is reported on three consecutive lines. However, if a sample fell below the coverage threshold at that site, the line will be omitted::
 
-    #SAMPLE  CHR    POS  A   C    G    T  CVRG  ALLELES  MAJOR  MINOR  MINOR.FREQ.PERC.
-    BLOOD_1  chr20  99   0   101  1    2  104   1        C      T      0.01923
-    BLOOD_2  chr20  99   82  44   0    1  127   2        A      C      0.34646
-    BLOOD_3  chr20  99   0   110  1    0  111   1        C      G      0.009
-    BLOOD_1  chr20  100  3   5    100  0  108   1        G      C      0.0463
-    BLOOD_3  chr20  100  1   118  11   0  130   0        C      G      0.08462
+    #SAMPLE  CHR    POS  A   C    G    T  CVRG  ALLELES  MAJOR  MINOR  MAF      BIAS
+    BLOOD_1  chr20  99   0   101  1    2  104   1        C      T      0.01923  0.33657
+    BLOOD_2  chr20  99   82  44   0    1  127   2        A      C      0.34646  0.07823
+    BLOOD_3  chr20  99   0   110  1    0  111   1        C      G      0.009    1.00909
+    BLOOD_1  chr20  100  3   5    100  0  108   1        G      C      0.0463   0.15986
+    BLOOD_3  chr20  100  1   118  11   0  130   0        C      G      0.08462  0.04154
 
 -----
 
@@ -94,6 +106,8 @@
 
 The alleles passing the threshold on each strand must match (though not in order), or the allele count will be 0. So a site with A, C, G on the plus strand and A, G on the minus strand will get an allele count of zero, though the (strand-independent) major allele, minor allele, and minor allele frequency will still be reported. If there is a tie for the minor allele, one will be randomly chosen.
 
+Additionally, a measure of strand bias is given in the last column. This is calculated using the method of Guo et al., 2012. A value of "." is given when there is no valid result of the calculation due to a zero denominator. This occurs when there are no reads on one of the strands, or when there is no minor allele.
+
   </help>
 
-</tool>
\ No newline at end of file
+</tool>