annotate allele-counts.py @ 1:49bb46c3a1af

Uploaded script
author nick
date Fri, 24 May 2013 10:33:35 -0400
parents
children 318fdf77aa54
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
1 #!/usr/bin/python
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
2 # This parses the output of Dan's "Naive Variant Detector" (previously,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
4 #
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
5 # New in this version: default to stdin and stdout, override by using -i and -o
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
6 # to specify filenames
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
7 #
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
8 # TODO:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
9 # - test handling of -c 0 (and -f 0?)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
10 # - should it technically handle data lines that start with a '#'?
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
11 import os
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
12 import sys
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
13 from optparse import OptionParser
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
14
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
15 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
16 'major', 'minor', 'freq'] #, 'bias']
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
17 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
18 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
19 %prog [options] -i variants.vcf -o alleles.csv"""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
20 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
21 'print_header':False, 'stdin':False}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
22 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
23 EPILOG = """Requirements:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
24 The input VCF must report the variants for each strand.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
25 The variants should be case-sensitive (e.g. all capital base letters).
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
26 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
27
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
28
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
29 def get_options(defaults, usage, description='', epilog=''):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
30 """Get options, print usage text."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
31
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
32 parser = OptionParser(usage=usage, description=description, epilog=epilog)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
33
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
34 parser.add_option('-i', '--infile', dest='infile',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
35 default=defaults.get('infile'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
36 help='Read input VCF data from this file instead of stdin.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
37 parser.add_option('-o', '--outfile', dest='outfile',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
38 default=defaults.get('outfile'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
39 help='Print output data to this file instead of stdout.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
40 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
41 default=defaults.get('freq_thres'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
42 help='Frequency threshold for counting alleles, given in percentage: -f 1 = 1% frequency. Default is %default%.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
43 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
44 default=defaults.get('covg_thres'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
45 help='Coverage threshold. Each site must be supported by at least this many reads on each strand. Otherwise the site will not be printed in the output. The default is %default reads per strand.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
46 parser.add_option('-H', '--header', dest='print_header', action='store_const',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
47 const=not(defaults.get('print_header')), default=defaults.get('print_header'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
48 help='Print header line. This is a #-commented line with the column labels. Off by default.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
49 parser.add_option('-d', '--debug', dest='debug', action='store_true',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
50 default=False,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
51 help='Turn on debug mode. You must also specify a single site to process in a final argument using UCSC coordinate format.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
52
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
53 (options, args) = parser.parse_args()
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
54
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
55 # read in positional arguments
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
56 arguments = {}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
57 if options.debug:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
58 if len(args) >= 1:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
59 arguments['print_loc'] = args[0]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
60 args.remove(args[0])
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
61
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
62 return (options, arguments)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
63
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
64
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
65 def main():
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
66
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
67 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
68
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
69 infile = options.infile
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
70 outfile = options.outfile
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
71 print_header = options.print_header
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
72 freq_thres = options.freq_thres / 100.0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
73 covg_thres = options.covg_thres
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
74 debug = options.debug
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
75
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
76 if debug:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
77 print_loc = args.get('print_loc')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
78 if print_loc:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
79 if ':' in print_loc:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
80 (print_chr, print_pos) = print_loc.split(':')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
81 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
82 print_pos = print_loc
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
83 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
84 sys.stderr.write("Warning: No site coordinate found in arguments. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
85 +"Turning off debug mode.\n")
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
86 debug = False
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
87
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
88 # set infile_handle to either stdin or the input file
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
89 if infile == OPT_DEFAULTS.get('infile'):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
90 infile_handle = sys.stdin
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
91 sys.stderr.write("Reading from standard input..\n")
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
92 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
93 if os.path.exists(infile):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
94 infile_handle = open(infile, 'r')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
95 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
96 fail('Error: Input VCF file '+infile+' not found.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
97
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
98 # set outfile_handle to either stdout or the output file
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
99 if outfile == OPT_DEFAULTS.get('outfile'):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
100 outfile_handle = sys.stdout
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
101 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
102 if os.path.exists(outfile):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
103 fail('Error: The given output filename '+outfile+' already exists.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
104 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
105 outfile_handle = open(outfile, 'w')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
106
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
107 if print_header:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
108 outfile_handle.write('#'+'\t'.join(COLUMNS)+"\n")
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
109
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
110 # main loop: process and print one line at a time
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
111 sample_names = []
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
112 for line in infile_handle:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
113 line = line.rstrip('\r\n')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
114
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
115 # header lines
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
116 if line[0] == '#':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
117 if line[0:6].upper() == '#CHROM':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
118 sample_names = line.split('\t')[9:]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
119 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
120
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
121 if not sample_names:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
122 fail("Error in input VCF: Data line encountered before header line. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
123 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
124
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
125 site_data = read_site(line, sample_names, CANONICAL_VARIANTS)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
126
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
127 if debug:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
128 if site_data['pos'] != print_pos:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
129 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
130 try:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
131 if site_data['chr'] != print_chr:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
132 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
133 except NameError, e:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
134 pass # No chr specified. Just go ahead and print the line.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
135
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
136 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
137 freq_thres, covg_thres, debug=debug)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
138
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
139 if debug and site_summary[0]['print']:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
140 print line.split('\t')[9].split(':')[-1]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
141
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
142 print_site(outfile_handle, site_summary, COLUMNS)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
143
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
144 # close any open filehandles
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
145 if infile_handle is not sys.stdin:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
146 infile_handle.close()
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
147 if outfile_handle is not sys.stdout:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
148 outfile_handle.close()
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
149
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
150 # keeps Galaxy from giving an error if there were messages on stderr
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
151 sys.exit(0)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
152
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
153
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
154
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
155 def read_site(line, sample_names, canonical):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
156 """Read in a line, parse the variants into a data structure, and return it.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
157 The line should be actual site data, not a header line, so check beforehand.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
158 Notes:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
159 - The line is assumed to have been chomped."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
160
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
161 site = {}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
162 fields = line.split('\t')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
163
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
164 if len(fields) < 9:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
165 fail("Error in input VCF: wrong number of fields in data line. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
166 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
167
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
168 site['chr'] = fields[0]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
169 site['pos'] = fields[1]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
170 samples = fields[9:]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
171
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
172 if len(samples) < len(sample_names):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
173 fail("Error in input VCF: missing sample fields in data line. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
174 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
175 elif len(samples) > len(sample_names):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
176 fail("Error in input VCF: more sample fields in data line than in header. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
177 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
178
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
179 sample_counts = {}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
180 for i in range(len(samples)):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
181
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
182 variant_counts = {}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
183 counts = samples[i].split(':')[-1]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
184 counts = counts.split(',')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
185
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
186 for count in counts:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
187 if not count:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
188 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
189 fields = count.split('=')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
190 if len(fields) != 2:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
191 fail("Error in input VCF: Incorrect variant data format (must contain "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
192 +"a single '='). Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
193 (variant, reads) = fields
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
194 if variant[1:] not in canonical:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
195 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
196 if variant[0] != '-' and variant[0] != '+':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
197 fail("Error in input VCF: variant data not strand-specific. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
198 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
199 try:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
200 variant_counts[variant] = int(reads)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
201 except ValueError, e:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
202 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
203
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
204 sample_counts[sample_names[i]] = variant_counts
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
205
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
206 site['samples'] = sample_counts
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
207
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
208 return site
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
209
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
210
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
211 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
212 debug=False):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
213 """Take the raw data from the VCF line and transform it into the summary data
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
214 to be printed in the output format."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
215
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
216 site_summary = []
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
217 for sample_name in sample_names:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
218
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
219 sample = {'print':False}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
220 variants = site['samples'].get(sample_name)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
221 if not variants:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
222 site_summary.append(sample)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
223 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
224
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
225 sample['sample'] = sample_name
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
226 sample['chr'] = site['chr']
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
227 sample['pos'] = site['pos']
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
228
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
229 coverage = sum(variants.values())
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
230
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
231 # get stranded coverage
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
232 covg_plus = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
233 covg_minus = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
234 for variant in variants:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
235 if variant[0] == '+':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
236 covg_plus += variants[variant]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
237 elif variant[0] == '-':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
238 covg_minus += variants[variant]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
239 # stranded coverage threshold
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
240 if coverage <= 0 or covg_plus < covg_thres or covg_minus < covg_thres:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
241 site_summary.append(sample)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
242 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
243 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
244 sample['print'] = True
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
245
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
246 # get an ordered list of read counts for all variants (either strand)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
247 ranked_bases = get_read_counts(variants, 0, strands='+-', debug=debug)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
248
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
249 # record read counts into dict for this sample
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
250 for base in ranked_bases:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
251 sample[base[0]] = base[1]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
252 # fill in any zeros
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
253 for variant in canonical:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
254 if not sample.has_key(variant):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
255 sample[variant] = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
256
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
257 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
258
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
259 # set minor allele to N if there's a tie for 2nd
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
260 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
261 ranked_bases[1] = ('N', 0)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
262 sample['alleles'] = 1 if sample['alleles'] else 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
263
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
264 if debug: print ranked_bases
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
265
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
266 sample['coverage'] = coverage
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
267 try:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
268 sample['major'] = ranked_bases[0][0]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
269 except IndexError, e:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
270 sample['major'] = '.'
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
271 try:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
272 sample['minor'] = ranked_bases[1][0]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
273 sample['freq'] = ranked_bases[1][1] / float(coverage)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
274 except IndexError, e:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
275 sample['minor'] = '.'
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
276 sample['freq'] = 0.0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
277
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
278 site_summary.append(sample)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
279
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
280 return site_summary
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
281
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
282
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
283 def print_site(filehandle, site, columns):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
284 """Print the output lines for one site (one per sample).
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
285 filehandle must be open."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
286 for sample in site:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
287 if sample['print']:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
288 fields = [str(sample.get(column)) for column in columns]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
289 filehandle.write('\t'.join(fields)+"\n")
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
290
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
291
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
292 def get_read_counts(variant_counts, freq_thres, strands='+-', debug=False):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
293 """Count the number of reads for each base, and create a ranked list of
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
294 alleles passing the frequency threshold.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
295 Arguments:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
296 variant_counts: Dict of the stranded variants (keys) and their read counts (values).
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
297 freq_thres: The frequency threshold each allele needs to pass to be included.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
298 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
299 variants: A list of the variants of interest. Other types of variants will not
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
300 be included in the returned list. If no list is given, all variants found in
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
301 the variant_counts will be used.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
302 Return value:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
303 ranked_bases: A list of the alleles and their read counts. The elements are
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
304 tuples (base, read count). The alleles are listed in descending order of
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
305 frequency, and only those passing the threshold are included."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
306
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
307 # Get list of all variants from variant_counts list
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
308 variants = [variant[1:] for variant in variant_counts]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
309 # deduplicate via a dict
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
310 variant_dict = dict((variant, 1) for variant in variants)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
311 variants = variant_dict.keys()
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
312
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
313 ranked_bases = []
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
314 for variant in variants:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
315 reads = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
316 for strand in strands:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
317 reads += variant_counts.get(strand+variant, 0)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
318 ranked_bases.append((variant, reads))
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
319
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
320 # get coverage for the specified strands
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
321 coverage = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
322 for variant in variant_counts:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
323 if variant[0] in strands:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
324 coverage += variant_counts.get(variant, 0)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
325 # if debug: print "strands: "+strands+', covg: '+str(coverage)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
326
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
327 if coverage < 1:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
328 return []
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
329
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
330 # sort the list of alleles by read count
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
331 ranked_bases.sort(reverse=True, key=lambda base: base[1])
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
332
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
333 if debug:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
334 print strands+' coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
335 for base in ranked_bases:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
336 print (base[0]+': '+str(base[1])+'/'+str(float(coverage))+' = '+
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
337 str(base[1]/float(coverage)))
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
338
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
339 # remove bases below the frequency threshold
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
340 ranked_bases = [base for base in ranked_bases
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
341 if base[1]/float(coverage) >= freq_thres]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
342
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
343 return ranked_bases
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
344
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
345
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
346 def count_alleles(variant_counts, freq_thres, debug=False):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
347 """Determine how many alleles to report, based on filtering rules.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
348 The current rule determines which bases pass the frequency threshold on each
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
349 strand individually, then compares the two sets of bases. If they are the same
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
350 (regardless of order), the allele count is the number of bases. Otherwise it
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
351 is zero."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
352 allele_count = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
353
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
354 alleles_plus = get_read_counts(variant_counts, freq_thres, debug=debug,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
355 strands='+')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
356 alleles_minus = get_read_counts(variant_counts, freq_thres, debug=debug,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
357 strands='-')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
358
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
359 if debug:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
360 print '+ '+str(alleles_plus)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
361 print '- '+str(alleles_minus)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
362
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
363 # check if each strand reports the same set of alleles
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
364 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
365 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
366 if alleles_plus_sorted == alleles_minus_sorted:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
367 allele_count = len(alleles_plus)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
368
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
369 return allele_count
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
370
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
371
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
372 def fail(message):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
373 sys.stderr.write(message+'\n')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
374 sys.exit(1)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
375
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
376 if __name__ == "__main__":
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
377 main()