diff allele-counts.py @ 2:318fdf77aa54

Current version, from another toolshed Includes change in header line
author nick
date Fri, 31 May 2013 12:33:48 -0400
parents 49bb46c3a1af
children 31361191d2d2
line wrap: on
line diff
--- a/allele-counts.py	Fri May 24 10:33:35 2013 -0400
+++ b/allele-counts.py	Fri May 31 12:33:48 2013 -0400
@@ -2,8 +2,9 @@
 # This parses the output of Dan's "Naive Variant Detector" (previously,
 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
 #
-# New in this version: default to stdin and stdout, override by using -i and -o
-#   to specify filenames
+# New in this version:
+#   Made header line customizable
+#     - separate from internal column labels, which are used as dict keys
 #
 # TODO:
 # - test handling of -c 0 (and -f 0?)
@@ -12,8 +13,8 @@
 import sys
 from optparse import OptionParser
 
-COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles',
-  'major', 'minor', 'freq'] #, 'bias']
+COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
+COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv
        %prog [options] -i variants.vcf -o alleles.csv"""
@@ -99,13 +100,15 @@
   if outfile == OPT_DEFAULTS.get('outfile'):
     outfile_handle = sys.stdout
   else:
-    if os.path.exists(outfile):
-      fail('Error: The given output filename '+outfile+' already exists.')
-    else:
+    try:
       outfile_handle = open(outfile, 'w')
+    except IOError, e:
+      fail('Error: The given output filename '+outfile+' could not be opened.')
 
+  if len(COLUMNS) != len(COLUMN_LABELS):
+    fail('Error: Internal column names do not match column labels.')
   if print_header:
-    outfile_handle.write('#'+'\t'.join(COLUMNS)+"\n")
+    outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n")
 
   # main loop: process and print one line at a time
   sample_names = []
@@ -197,9 +200,9 @@
         fail("Error in input VCF: variant data not strand-specific. "
           +"Failed on line:\n"+line)
       try:
-        variant_counts[variant] = int(reads)
+        variant_counts[variant] = int(float(reads))
       except ValueError, e:
-        continue
+        fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
 
     sample_counts[sample_names[i]] = variant_counts
 
@@ -270,7 +273,7 @@
       sample['major']  = '.'
     try:
       sample['minor']  = ranked_bases[1][0]
-      sample['freq']   = ranked_bases[1][1] / float(coverage)
+      sample['freq']   = round(ranked_bases[1][1]/float(coverage), 5)
     except IndexError, e:
       sample['minor']  = '.'
       sample['freq']   = 0.0