# HG changeset patch # User nick # Date 1370018028 14400 # Node ID 318fdf77aa5483c859eedfd0ebadaa526f2c4379 # Parent 49bb46c3a1afd5c8493dc18cfc1763a7f509da93 Current version, from another toolshed Includes change in header line diff -r 49bb46c3a1af -r 318fdf77aa54 allele-counts.py --- a/allele-counts.py Fri May 24 10:33:35 2013 -0400 +++ b/allele-counts.py Fri May 31 12:33:48 2013 -0400 @@ -2,8 +2,9 @@ # This parses the output of Dan's "Naive Variant Detector" (previously, # "BAM Coverage"). It was forked from the code of "bam-coverage.py". # -# New in this version: default to stdin and stdout, override by using -i and -o -# to specify filenames +# New in this version: +# Made header line customizable +# - separate from internal column labels, which are used as dict keys # # TODO: # - test handling of -c 0 (and -f 0?) @@ -12,8 +13,8 @@ import sys from optparse import OptionParser -COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', - 'major', 'minor', 'freq'] #, 'bias'] +COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias'] +COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv %prog [options] -i variants.vcf -o alleles.csv""" @@ -99,13 +100,15 @@ if outfile == OPT_DEFAULTS.get('outfile'): outfile_handle = sys.stdout else: - if os.path.exists(outfile): - fail('Error: The given output filename '+outfile+' already exists.') - else: + try: outfile_handle = open(outfile, 'w') + except IOError, e: + fail('Error: The given output filename '+outfile+' could not be opened.') + if len(COLUMNS) != len(COLUMN_LABELS): + fail('Error: Internal column names do not match column labels.') if print_header: - outfile_handle.write('#'+'\t'.join(COLUMNS)+"\n") + outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n") # main loop: process and print one line at a time sample_names = [] @@ -197,9 +200,9 @@ fail("Error in input VCF: variant data not strand-specific. " +"Failed on line:\n"+line) try: - variant_counts[variant] = int(reads) + variant_counts[variant] = int(float(reads)) except ValueError, e: - continue + fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line) sample_counts[sample_names[i]] = variant_counts @@ -270,7 +273,7 @@ sample['major'] = '.' try: sample['minor'] = ranked_bases[1][0] - sample['freq'] = ranked_bases[1][1] / float(coverage) + sample['freq'] = round(ranked_bases[1][1]/float(coverage), 5) except IndexError, e: sample['minor'] = '.' sample['freq'] = 0.0