comparison allele-counts.py @ 5:31361191d2d2

Uploaded tarball. Version 1.1: Stranded output, slightly different handling of minor allele ties and 0 coverage sites, revised help text, added test datasets.
author nick
date Thu, 12 Sep 2013 11:34:23 -0400
parents 318fdf77aa54
children df3b28364cd2
--- allele-counts.py (4:898eb3daab43)
+++ allele-counts.py (5:31361191d2d2)
@@ old lines 1-27 / new lines 1-27 @@
 #!/usr/bin/python
 # This parses the output of Dan's "Naive Variant Detector" (previously,
 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
+# Or, to put it briefly,
+# cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
 #
 # New in this version:
 # Made header line customizable
 # - separate from internal column labels, which are used as dict keys
-#
-# TODO:
-# - test handling of -c 0 (and -f 0?)
-# - should it technically handle data lines that start with a '#'?
 import os
 import sys
+import random
 from optparse import OptionParser

 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
-USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv
-  %prog [options] -i variants.vcf -o alleles.csv"""
+USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv
+  cat variants.vcf | %prog [options] > alleles.csv"""
 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
-  'print_header':False, 'stdin':False}
+  'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False,
+  'debug_loc':'', 'seed':''}
 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
 EPILOG = """Requirements:
 The input VCF must report the variants for each strand.
 The variants should be case-sensitive (e.g. all capital base letters).
 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
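The one-liner added to the header comment pulls the per-sample allele counts out of the VCF: drop header lines, take the 10th column, take its 4th ':'-separated subfield, and turn ',' and '=' into tabs and colons. A rough Python equivalent, for illustration only (the field layout is assumed from that comment and from the example in read_site's docstring further down, not stated elsewhere in the changeset):

    # Illustrative sketch, not part of either revision.
    # Assumes the 10th VCF column's 4th ':'-subfield looks like '+A=32,-A=45,-G=1'.
    def counts_field(vcf_line):
      if vcf_line.startswith('#'):
        return None                                        # grep -v '^#'
      sample_col = vcf_line.rstrip('\r\n').split('\t')[9]  # cut -f 10
      counts = sample_col.split(':')[3]                    # cut -d ':' -f 4
      return counts.replace(',', '\t').replace('=', ':')   # tr ',=' '\t:'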
@@ old lines 38-92 / new lines 38-107 @@
   parser.add_option('-o', '--outfile', dest='outfile',
     default=defaults.get('outfile'),
     help='Print output data to this file instead of stdout.')
   parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
     default=defaults.get('freq_thres'),
-    help='Frequency threshold for counting alleles, given in percentage: -f 1 = 1% frequency. Default is %default%.')
+    help=('Frequency threshold for counting alleles, given in percentage: -f 1 '
+      +'= 1% frequency. Default is %default%.'))
   parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
     default=defaults.get('covg_thres'),
-    help='Coverage threshold. Each site must be supported by at least this many reads on each strand. Otherwise the site will not be printed in the output. The default is %default reads per strand.')
+    help=('Coverage threshold. Each site must be supported by at least this '
+      +'many reads on each strand. Otherwise the site will not be printed in '
+      +'the output. The default is %default reads per strand.'))
+  parser.add_option('-n', '--no-filter', dest='no_filter', action='store_const',
+    const=not(defaults.get('no_filter')), default=defaults.get('no_filter'),
+    help=('Operate without a frequency threshold or coverage threshold. '
+      +'Equivalent to "-c 0 -f 0".'))
   parser.add_option('-H', '--header', dest='print_header', action='store_const',
     const=not(defaults.get('print_header')), default=defaults.get('print_header'),
-    help='Print header line. This is a #-commented line with the column labels. Off by default.')
-  parser.add_option('-d', '--debug', dest='debug', action='store_true',
-    default=False,
-    help='Turn on debug mode. You must also specify a single site to process in a final argument using UCSC coordinate format.')
+    help=('Print header line. This is a #-commented line with the column '
+      +'labels. Off by default.'))
+  parser.add_option('-s', '--stranded', dest='stranded', action='store_const',
+    const=not(defaults.get('stranded')), default=defaults.get('stranded'),
+    help='Report variant counts by strand, in separate columns. Off by default.')
+  parser.add_option('-r', '--rand-seed', dest='seed',
+    default=defaults.get('seed'), help=('Seed for random number generator.'))
+  parser.add_option('-d', '--debug', dest='debug_loc',
+    default=defaults.get('debug_loc'),
+    help=('Turn on debug mode and specify a single site to process using UCSC '
+      +'coordinate format. You can also append a sample ID after another ":" '
+      +'to restrict it further.'))

   (options, args) = parser.parse_args()

-  # read in positional arguments
-  arguments = {}
-  if options.debug:
-    if len(args) >= 1:
-      arguments['print_loc'] = args[0]
-      args.remove(args[0])
-
-  return (options, arguments)
+  return (options, args)


 def main():

   (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG)

   infile = options.infile
   outfile = options.outfile
   print_header = options.print_header
   freq_thres = options.freq_thres / 100.0
   covg_thres = options.covg_thres
-  debug = options.debug
-
-  if debug:
-    print_loc = args.get('print_loc')
-    if print_loc:
-      if ':' in print_loc:
-        (print_chr, print_pos) = print_loc.split(':')
-      else:
-        print_pos = print_loc
-    else:
-      sys.stderr.write("Warning: No site coordinate found in arguments. "
-        +"Turning off debug mode.\n")
-      debug = False
+  stranded = options.stranded
+  debug_loc = options.debug_loc
+  seed = options.seed
+
+  if options.no_filter:
+    freq_thres = 0
+    covg_thres = 0
+
+  if seed:
+    random.seed(seed)
+
+  debug = False
+  print_sample = ''
+  if debug_loc:
+    debug = True
+    coords = debug_loc.split(':')
+    print_chr = coords[0]
+    print_pos = ''
+    if len(coords) > 1: print_pos = coords[1]
+    if len(coords) > 2: print_sample = coords[2]

   # set infile_handle to either stdin or the input file
   if infile == OPT_DEFAULTS.get('infile'):
     infile_handle = sys.stdin
     sys.stderr.write("Reading from standard input..\n")
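The reworked -d/--debug option folds the old positional site argument into the option value itself, split on ':'. A small sketch of the three accepted forms (the chromosome, position, and sample name here are borrowed from the example in read_site's docstring below):

    # Illustrative sketch, not part of either revision.
    for debug_loc in ('chr1', 'chr1:2617', 'chr1:2617:THYROID'):
      coords = debug_loc.split(':')
      print_chr = coords[0]
      print_pos = coords[1] if len(coords) > 1 else ''
      print_sample = coords[2] if len(coords) > 2 else ''
      print (print_chr, print_pos, print_sample)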
@@ old lines 103-118 / new lines 118-140 @@
   try:
     outfile_handle = open(outfile, 'w')
   except IOError, e:
     fail('Error: The given output filename '+outfile+' could not be opened.')

+  # Take care of column names, print header
   if len(COLUMNS) != len(COLUMN_LABELS):
-    fail('Error: Internal column names do not match column labels.')
+    fail('Error: Internal column names list do not match column labels list.')
+  if stranded:
+    COLUMNS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T']
+    COLUMN_LABELS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T']
   if print_header:
     outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n")

-  # main loop: process and print one line at a time
+  # main loop
+  # each iteration processes one VCF line and prints one or more output lines
+  # one VCF line = one site, one or more samples
+  # one output line = one site, one sample
   sample_names = []
   for line in infile_handle:
     line = line.rstrip('\r\n')

     # header lines
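With -s, the single A/C/G/T columns become eight per-strand columns, so a header printed with -H changes accordingly. A sketch of the splice shown above, applied to a copy of the label list:

    # Illustrative sketch, not part of either revision.
    labels = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES',
      'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.']
    labels[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T']
    header = '#'+'\t'.join(labels)
    # header columns: SAMPLE, CHR, POS, +A, +C, +G, +T, -A, -C, -G, -T, CVRG, ...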
126 +"Failed on line:\n"+line) 148 +"Failed on line:\n"+line)
127 149
128 site_data = read_site(line, sample_names, CANONICAL_VARIANTS) 150 site_data = read_site(line, sample_names, CANONICAL_VARIANTS)
129 151
130 if debug: 152 if debug:
131 if site_data['pos'] != print_pos: 153 if print_pos != site_data['pos']:
132 continue 154 continue
133 try: 155 if print_chr != site_data['chr'] and print_chr != '':
134 if site_data['chr'] != print_chr: 156 continue
135 continue 157 if print_sample != '':
136 except NameError, e: 158 for sample in site_data['samples'].keys():
137 pass # No chr specified. Just go ahead and print the line. 159 if sample.lower() != print_sample.lower():
160 site_data['samples'].pop(sample, None)
161 if len(site_data['samples']) == 0:
162 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n")
163 sys.exit(1)
164
138 165
139 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, 166 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
140 freq_thres, covg_thres, debug=debug) 167 freq_thres, covg_thres, stranded, debug=debug)
141 168
142 if debug and site_summary[0]['print']: 169 if debug and site_summary[0]['print']:
143 print line.split('\t')[9].split(':')[-1] 170 print line.split('\t')[9].split(':')[-1]
144 171
145 print_site(outfile_handle, site_summary, COLUMNS) 172 print_site(outfile_handle, site_summary, COLUMNS)
146 173
147 # close any open filehandles 174 # close any open filehandles
148 if infile_handle is not sys.stdin: 175 if infile_handle is not sys.stdin:
@@ old lines 156-167 / new lines 183-213 @@


 def read_site(line, sample_names, canonical):
   """Read in a line, parse the variants into a data structure, and return it.
   The line should be actual site data, not a header line, so check beforehand.
-  Notes:
-  - The line is assumed to have been chomped."""
+  Only the variants in 'canonical' will be read; all others are ignored.
+  Note: the line is assumed to have been chomped.
+  The returned data is stored in a dict, with the following structure:
+  {
+    'chr': 'chr1',
+    'pos': '2617',
+    'samples': {
+      'THYROID': {
+        '+A': 32,
+        '-A': 45,
+        '-G': 1,
+      },
+      'BLOOD': {
+        '+A': 2,
+        '-C': 1,
+        '+G': 37,
+        '-G': 42,
+      },
+    },
+  }
+  """

   site = {}
   fields = line.split('\t')

   if len(fields) < 9:
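Those stranded keys ('+A', '-G', ...) are what the new get_read_counts, further down, collapses into per-base totals. A sketch of that sum using the 'BLOOD' sample from the docstring example:

    # Illustrative sketch, not part of either revision.
    blood = {'+A': 2, '-C': 1, '+G': 37, '-G': 42}
    summed = {}
    for variant, count in blood.items():
      base = variant[1:]               # strip the '+'/'-' strand prefix
      summed[base] = summed.get(base, 0) + count
    # summed == {'A': 2, 'C': 1, 'G': 79}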
@@ old lines 210-231 / new lines 256-274 @@

   return site


 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres,
-    debug=False):
+    stranded=False, debug=False):
   """Take the raw data from the VCF line and transform it into the summary data
   to be printed in the output format."""

   site_summary = []
   for sample_name in sample_names:

     sample = {'print':False}
     variants = site['samples'].get(sample_name)
-    if not variants:
-      site_summary.append(sample)
-      continue

     sample['sample'] = sample_name
     sample['chr'] = site['chr']
     sample['pos'] = site['pos']

@@ old lines 238-272 / new lines 281-333 @@
       if variant[0] == '+':
         covg_plus += variants[variant]
       elif variant[0] == '-':
         covg_minus += variants[variant]
     # stranded coverage threshold
-    if coverage <= 0 or covg_plus < covg_thres or covg_minus < covg_thres:
+    if covg_plus < covg_thres or covg_minus < covg_thres:
       site_summary.append(sample)
       continue
     else:
       sample['print'] = True

-    # get an ordered list of read counts for all variants (either strand)
-    ranked_bases = get_read_counts(variants, 0, strands='+-', debug=debug)
-
-    # record read counts into dict for this sample
-    for base in ranked_bases:
-      sample[base[0]] = base[1]
-    # fill in any zeros
-    for variant in canonical:
-      if not sample.has_key(variant):
-        sample[variant] = 0
-
-    sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
-
-    # set minor allele to N if there's a tie for 2nd
+    # get an ordered list of read counts for all variants (both strands)
+    bases = get_read_counts(variants, '+-')
+    ranked_bases = process_read_counts(bases, sort=True, debug=debug)
+
+    # prepare stranded or unstranded lists of base counts
+    base_count_lists = []
+    if stranded:
+      strands = ['+', '-']
+      base_count_lists.append(get_read_counts(variants, '+'))
+      base_count_lists.append(get_read_counts(variants, '-'))
+    else:
+      strands = ['']
+      base_count_lists.append(ranked_bases)
+
+    # record read counts into output dict
+    # If stranded, this will loop twice, once for each strand, and prepend '+'
+    # or '-' to the base name. If not stranded, it will loop once, and prepend
+    # nothing ('').
+    for (strand, base_count_list) in zip(strands, base_count_lists):
+      for base_count in base_count_list:
+        sample[strand+base_count[0]] = base_count[1]
+      # fill in any zeros
+      for base in canonical:
+        if not sample.has_key(strand+base):
+          sample[strand+base] = 0
+
+    sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
+
+    # If there's a tie for 2nd, randomly choose one to be 2nd
     if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]:
-      ranked_bases[1] = ('N', 0)
-      sample['alleles'] = 1 if sample['alleles'] else 0
-
-    if debug: print ranked_bases
+      swap = random.choice([True,False])
+      if swap:
+        tmp_base = ranked_bases[1]
+        ranked_bases[1] = ranked_bases[2]
+        ranked_bases[2] = tmp_base
+
+    if debug: print "ranked +-: "+str(ranked_bases)

     sample['coverage'] = coverage
     try:
       sample['major'] = ranked_bases[0][0]
     except IndexError, e:
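The tie handling above replaces the old ('N', 0) placeholder with a coin flip, which the new -r/--rand-seed option makes repeatable. A sketch with made-up tied counts:

    # Illustrative sketch, not part of either revision.
    import random
    random.seed('1')                   # what '-r 1' would hand to random.seed()
    ranked_bases = [('G', 40), ('A', 3), ('C', 3)]   # hypothetical tie for 2nd
    if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]:
      if random.choice([True, False]):
        ranked_bases[1], ranked_bases[2] = ranked_bases[2], ranked_bases[1]
    # Re-running with the same seed gives the same minor allele every time.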
@@ old lines 281-380 / new lines 342-450 @@
     site_summary.append(sample)

   return site_summary


+def get_read_counts(stranded_counts, strands='+-'):
+  """Do a simple sum of the read counts per variant, on the specified strands.
+  Arguments:
+  stranded_counts: Dict of the stranded variants (keys) and their read counts
+    (values).
+  strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
+  Return value:
+  summed_counts: A list of the alleles and their read counts. The elements are
+    tuples (variant, read count)."""
+
+  variants = stranded_counts.keys()
+
+  summed_counts = {}
+  for variant in variants:
+    strand = variant[0]
+    base = variant[1:]
+    if strand in strands:
+      summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0)
+
+  return summed_counts.items()
+
+
+def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False):
+  """Process a list of read counts by frequency filtering and/or sorting.
+  Arguments:
+  variant_counts: List of the non-stranded variants and their read counts. The
+    elements are tuples (variant, read count).
+  freq_thres: The frequency threshold each allele needs to pass to be included.
+  sort: Whether to sort the list in descending order of read counts.
+  Return value:
+  variant_counts: A list of the processed alleles and their read counts. The
+    elements are tuples (variant, read count)."""
+
+  # get coverage for the specified strands
+  coverage = 0
+  for variant in variant_counts:
+    coverage += variant[1]
+
+  if coverage <= 0:
+    return []
+
+  # sort the list of alleles by read count
+  if sort:
+    variant_counts.sort(reverse=True, key=lambda variant: variant[1])
+
+  if debug:
+    print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
+    for variant in variant_counts:
+      print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
+        str(variant[1]/float(coverage)))
+
+  # remove bases below the frequency threshold
+  if freq_thres > 0:
+    variant_counts = [variant for variant in variant_counts
+      if variant[1]/float(coverage) >= freq_thres]
+
+  return variant_counts
+
+
+def count_alleles(variant_counts, freq_thres, debug=False):
+  """Determine how many alleles to report, based on filtering rules.
+  The current rule determines which bases pass the frequency threshold on each
+  strand individually, then compares the two sets of bases. If they are the same
+  (regardless of order), the allele count is the number of bases. Otherwise it
+  is zero."""
+  allele_count = 0
+
+  alleles_plus = get_read_counts(variant_counts, '+')
+  alleles_plus = process_read_counts(alleles_plus, freq_thres=freq_thres,
+    sort=False, debug=debug)
+  alleles_minus = get_read_counts(variant_counts, '-')
+  alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres,
+    sort=False, debug=debug)
+
+  if debug:
+    print '+ '+str(alleles_plus)
+    print '- '+str(alleles_minus)
+
+  # Check if each strand reports the same set of alleles.
+  # Sorting by base is to compare lists without regard to order (as sets).
+  alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
+  alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
+  if alleles_plus_sorted == alleles_minus_sorted:
+    allele_count = len(alleles_plus)
+
+  return allele_count
+
+
 def print_site(filehandle, site, columns):
   """Print the output lines for one site (one per sample).
   filehandle must be open."""
   for sample in site:
     if sample['print']:
       fields = [str(sample.get(column)) for column in columns]
       filehandle.write('\t'.join(fields)+"\n")


-def get_read_counts(variant_counts, freq_thres, strands='+-', debug=False):
-  """Count the number of reads for each base, and create a ranked list of
-  alleles passing the frequency threshold.
-  Arguments:
-  variant_counts: Dict of the stranded variants (keys) and their read counts (values).
-  freq_thres: The frequency threshold each allele needs to pass to be included.
-  strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
-  variants: A list of the variants of interest. Other types of variants will not
-    be included in the returned list. If no list is given, all variants found in
-    the variant_counts will be used.
-  Return value:
-  ranked_bases: A list of the alleles and their read counts. The elements are
-    tuples (base, read count). The alleles are listed in descending order of
-    frequency, and only those passing the threshold are included."""
-
-  # Get list of all variants from variant_counts list
-  variants = [variant[1:] for variant in variant_counts]
-  # deduplicate via a dict
-  variant_dict = dict((variant, 1) for variant in variants)
-  variants = variant_dict.keys()
-
-  ranked_bases = []
-  for variant in variants:
-    reads = 0
-    for strand in strands:
-      reads += variant_counts.get(strand+variant, 0)
-    ranked_bases.append((variant, reads))
-
-  # get coverage for the specified strands
-  coverage = 0
-  for variant in variant_counts:
-    if variant[0] in strands:
-      coverage += variant_counts.get(variant, 0)
-  # if debug: print "strands: "+strands+', covg: '+str(coverage)
-
-  if coverage < 1:
-    return []
-
-  # sort the list of alleles by read count
-  ranked_bases.sort(reverse=True, key=lambda base: base[1])
-
-  if debug:
-    print strands+' coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
-    for base in ranked_bases:
-      print (base[0]+': '+str(base[1])+'/'+str(float(coverage))+' = '+
-        str(base[1]/float(coverage)))
-
-  # remove bases below the frequency threshold
-  ranked_bases = [base for base in ranked_bases
-    if base[1]/float(coverage) >= freq_thres]
-
-  return ranked_bases
-
-
-def count_alleles(variant_counts, freq_thres, debug=False):
-  """Determine how many alleles to report, based on filtering rules.
-  The current rule determines which bases pass the frequency threshold on each
-  strand individually, then compares the two sets of bases. If they are the same
-  (regardless of order), the allele count is the number of bases. Otherwise it
-  is zero."""
-  allele_count = 0
-
-  alleles_plus = get_read_counts(variant_counts, freq_thres, debug=debug,
-    strands='+')
-  alleles_minus = get_read_counts(variant_counts, freq_thres, debug=debug,
-    strands='-')
-
-  if debug:
-    print '+ '+str(alleles_plus)
-    print '- '+str(alleles_minus)
-
-  # check if each strand reports the same set of alleles
-  alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
-  alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
-  if alleles_plus_sorted == alleles_minus_sorted:
-    allele_count = len(alleles_plus)
-
-  return allele_count
-
-
 def fail(message):
   sys.stderr.write(message+'\n')
   sys.exit(1)

+
 if __name__ == "__main__":
   main()
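For the strand-bias rule spelled out in EPILOG, the new count_alleles filters each strand separately and only reports a nonzero count when both strands keep the same set of bases. A standalone sketch of that rule on the 'THYROID' counts from read_site's docstring, with the default 1% threshold (re-implemented here for illustration; the real logic is get_read_counts + process_read_counts + count_alleles above):

    # Illustrative sketch, not part of either revision.
    thyroid = {'+A': 32, '-A': 45, '-G': 1}
    freq_thres = 0.01                      # -f 1, i.e. 1%, after /100.0

    def passing_bases(counts, strand):
      kept = {}
      for variant, count in counts.items():
        if variant[0] == strand:
          kept[variant[1:]] = kept.get(variant[1:], 0) + count
      coverage = float(sum(kept.values()))
      return sorted(base for base, n in kept.items()
        if coverage > 0 and n/coverage >= freq_thres)

    plus = passing_bases(thyroid, '+')     # ['A']
    minus = passing_bases(thyroid, '-')    # ['A', 'G']  (1/46 is ~2%, above 1%)
    alleles = len(plus) if plus == minus else 0   # here: 0, a strand-bias failure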