allele_counts: allele-counts.py comparison

comparison allele-counts.py @ 2:318fdf77aa54

Current version, from another toolshed Includes change in header line

author	nick
date	Fri, 31 May 2013 12:33:48 -0400
parents	49bb46c3a1af
children	31361191d2d2

comparison

equal deleted inserted replaced

-:49bb46c3a1af
+:318fdf77aa54
 #!/usr/bin/python
 # This parses the output of Dan's "Naive Variant Detector" (previously,
 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
 #
-# New in this version: default to stdin and stdout, override by using -i and -o
+# New in this version:
-#   to specify filenames
+#   Made header line customizable
+#     - separate from internal column labels, which are used as dict keys
 #
 # TODO:
 # - test handling of -c 0 (and -f 0?)
 # - should it technically handle data lines that start with a '#'?
 import os
 import sys
 from optparse import OptionParser
-COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles',
+COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
-'major', 'minor', 'freq'] #, 'bias']
+COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv
 %prog [options] -i variants.vcf -o alleles.csv"""
 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
 'print_header':False, 'stdin':False}
 # set outfile_handle to either stdout or the output file
 if outfile == OPT_DEFAULTS.get('outfile'):
 outfile_handle = sys.stdout
 else:
-if os.path.exists(outfile):
+try:
-fail('Error: The given output filename '+outfile+' already exists.')
-else:
 outfile_handle = open(outfile, 'w')
+except IOError, e:
+fail('Error: The given output filename '+outfile+' could not be opened.')
+if len(COLUMNS) != len(COLUMN_LABELS):
+fail('Error: Internal column names do not match column labels.')
 if print_header:
-outfile_handle.write('#'+'\t'.join(COLUMNS)+"\n")
+outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n")
 # main loop: process and print one line at a time
 sample_names = []
 for line in infile_handle:
 line = line.rstrip('\r\n')
 continue
 if variant[0] != '-' and variant[0] != '+':
 fail("Error in input VCF: variant data not strand-specific. "
 +"Failed on line:\n"+line)
 try:
-variant_counts[variant] = int(reads)
+variant_counts[variant] = int(float(reads))
 except ValueError, e:
-continue
+fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
 sample_counts[sample_names[i]] = variant_counts
 site['samples'] = sample_counts
 sample['major']  = ranked_bases[0][0]
 except IndexError, e:
 sample['major']  = '.'
 try:
 sample['minor']  = ranked_bases[1][0]
-sample['freq']   = ranked_bases[1][1] / float(coverage)
+sample['freq']   = round(ranked_bases[1][1]/float(coverage), 5)
 except IndexError, e:
 sample['minor']  = '.'
 sample['freq']   = 0.0
 site_summary.append(sample)

Mercurial > repos > nick > allele_counts

comparison allele-counts.py @ 2:318fdf77aa54