comparison allele-counts.py @ 2:318fdf77aa54

Current version, from another toolshed Includes change in header line
author nick
date Fri, 31 May 2013 12:33:48 -0400
parents 49bb46c3a1af
children 31361191d2d2
comparison
equal deleted inserted replaced
1:49bb46c3a1af 2:318fdf77aa54
1 #!/usr/bin/python 1 #!/usr/bin/python
2 # This parses the output of Dan's "Naive Variant Detector" (previously, 2 # This parses the output of Dan's "Naive Variant Detector" (previously,
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py". 3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
4 # 4 #
5 # New in this version: default to stdin and stdout, override by using -i and -o 5 # New in this version:
6 # to specify filenames 6 # Made header line customizable
7 # - separate from internal column labels, which are used as dict keys
7 # 8 #
8 # TODO: 9 # TODO:
9 # - test handling of -c 0 (and -f 0?) 10 # - test handling of -c 0 (and -f 0?)
10 # - should it technically handle data lines that start with a '#'? 11 # - should it technically handle data lines that start with a '#'?
11 import os 12 import os
12 import sys 13 import sys
13 from optparse import OptionParser 14 from optparse import OptionParser
14 15
15 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 16 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
16 'major', 'minor', 'freq'] #, 'bias'] 17 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
17 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] 18 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
18 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv 19 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv
19 %prog [options] -i variants.vcf -o alleles.csv""" 20 %prog [options] -i variants.vcf -o alleles.csv"""
20 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, 21 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
21 'print_header':False, 'stdin':False} 22 'print_header':False, 'stdin':False}
97 98
98 # set outfile_handle to either stdout or the output file 99 # set outfile_handle to either stdout or the output file
99 if outfile == OPT_DEFAULTS.get('outfile'): 100 if outfile == OPT_DEFAULTS.get('outfile'):
100 outfile_handle = sys.stdout 101 outfile_handle = sys.stdout
101 else: 102 else:
102 if os.path.exists(outfile): 103 try:
103 fail('Error: The given output filename '+outfile+' already exists.')
104 else:
105 outfile_handle = open(outfile, 'w') 104 outfile_handle = open(outfile, 'w')
106 105 except IOError, e:
106 fail('Error: The given output filename '+outfile+' could not be opened.')
107
108 if len(COLUMNS) != len(COLUMN_LABELS):
109 fail('Error: Internal column names do not match column labels.')
107 if print_header: 110 if print_header:
108 outfile_handle.write('#'+'\t'.join(COLUMNS)+"\n") 111 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n")
109 112
110 # main loop: process and print one line at a time 113 # main loop: process and print one line at a time
111 sample_names = [] 114 sample_names = []
112 for line in infile_handle: 115 for line in infile_handle:
113 line = line.rstrip('\r\n') 116 line = line.rstrip('\r\n')
195 continue 198 continue
196 if variant[0] != '-' and variant[0] != '+': 199 if variant[0] != '-' and variant[0] != '+':
197 fail("Error in input VCF: variant data not strand-specific. " 200 fail("Error in input VCF: variant data not strand-specific. "
198 +"Failed on line:\n"+line) 201 +"Failed on line:\n"+line)
199 try: 202 try:
200 variant_counts[variant] = int(reads) 203 variant_counts[variant] = int(float(reads))
201 except ValueError, e: 204 except ValueError, e:
202 continue 205 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
203 206
204 sample_counts[sample_names[i]] = variant_counts 207 sample_counts[sample_names[i]] = variant_counts
205 208
206 site['samples'] = sample_counts 209 site['samples'] = sample_counts
207 210
268 sample['major'] = ranked_bases[0][0] 271 sample['major'] = ranked_bases[0][0]
269 except IndexError, e: 272 except IndexError, e:
270 sample['major'] = '.' 273 sample['major'] = '.'
271 try: 274 try:
272 sample['minor'] = ranked_bases[1][0] 275 sample['minor'] = ranked_bases[1][0]
273 sample['freq'] = ranked_bases[1][1] / float(coverage) 276 sample['freq'] = round(ranked_bases[1][1]/float(coverage), 5)
274 except IndexError, e: 277 except IndexError, e:
275 sample['minor'] = '.' 278 sample['minor'] = '.'
276 sample['freq'] = 0.0 279 sample['freq'] = 0.0
277 280
278 site_summary.append(sample) 281 site_summary.append(sample)