comparison vcfs2fasta.py @ 22:96f393ad7fc6 draft default tip

Uploaded
author ulfschaefer
date Wed, 23 Dec 2015 04:50:58 -0500
parents
children
comparison
equal deleted inserted replaced
21:b09ffe50c378 22:96f393ad7fc6
1 #!/usr/bin/env python
2 '''
3 Merge SNP data from multiple VCF files into a single fasta file.
4
5 Created on 5 Oct 2015
6
7 @author: alex
8 '''
9 import argparse
10 from collections import OrderedDict
11 from collections import defaultdict
12 import glob
13 import itertools
14 import logging
15 import os
16
17 from Bio import SeqIO
18 from bintrees import FastRBTree
19
20 # Try importing the matplotlib and numpy for stats.
21 try:
22 from matplotlib import pyplot as plt
23 import numpy
24 can_stats = True
25 except ImportError:
26 can_stats = False
27
28 import vcf
29
30 from phe.variant_filters import IUPAC_CODES
31
32
33 def plot_stats(pos_stats, total_samples, plots_dir="plots", discarded={}):
34 if not os.path.exists(plots_dir):
35 os.makedirs(plots_dir)
36
37 for contig in pos_stats:
38 plt.style.use('ggplot')
39
40 x = numpy.array([pos for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])
41 y = numpy.array([ float(pos_stats[contig][pos]["mut"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, []) ])
42
43 f, (ax1, ax2, ax3, ax4) = plt.subplots(4, sharex=True, sharey=True)
44 f.set_size_inches(12, 15)
45 ax1.plot(x, y, 'ro')
46 ax1.set_title("Fraction of samples with SNPs")
47 plt.ylim(0, 1.1)
48
49 y = numpy.array([ float(pos_stats[contig][pos]["N"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])
50 ax2.plot(x, y, 'bo')
51 ax2.set_title("Fraction of samples with Ns")
52
53 y = numpy.array([ float(pos_stats[contig][pos]["mix"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])
54 ax3.plot(x, y, 'go')
55 ax3.set_title("Fraction of samples with mixed bases")
56
57 y = numpy.array([ float(pos_stats[contig][pos]["gap"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])
58 ax4.plot(x, y, 'yo')
59 ax4.set_title("Fraction of samples with uncallable genotype (gap)")
60
61 contig = contig.replace("/", "-")
62 plt.savefig(os.path.join(plots_dir, "%s.png" % contig), dpi=100)
63
64 def get_mixture(record, threshold):
65 mixtures = {}
66 try:
67 if len(record.samples[0].data.AD) > 1:
68
69 total_depth = sum(record.samples[0].data.AD)
70 # Go over all combinations of touples.
71 for comb in itertools.combinations(range(0, len(record.samples[0].data.AD)), 2):
72 i = comb[0]
73 j = comb[1]
74
75 alleles = list()
76
77 if 0 in comb:
78 alleles.append(str(record.REF))
79
80 if i != 0:
81 alleles.append(str(record.ALT[i - 1]))
82 mixture = record.samples[0].data.AD[i]
83 if j != 0:
84 alleles.append(str(record.ALT[j - 1]))
85 mixture = record.samples[0].data.AD[j]
86
87 ratio = float(mixture) / total_depth
88 if ratio == 1.0:
89 logging.debug("This is only designed for mixtures! %s %s %s %s", record, ratio, record.samples[0].data.AD, record.FILTER)
90
91 if ratio not in mixtures:
92 mixtures[ratio] = []
93 mixtures[ratio].append(alleles.pop())
94
95 elif ratio >= threshold:
96 try:
97 code = IUPAC_CODES[frozenset(alleles)]
98 if ratio not in mixtures:
99 mixtures[ratio] = []
100 mixtures[ratio].append(code)
101 except KeyError:
102 logging.warn("Could not retrieve IUPAC code for %s from %s", alleles, record)
103 except AttributeError:
104 mixtures = {}
105
106 return mixtures
107
108 def print_stats(stats, pos_stats, total_vars):
109 for contig in stats:
110 for sample, info in stats[contig].items():
111 print "%s,%i,%i" % (sample, len(info.get("n_pos", [])), total_vars)
112
113 for contig in stats:
114 for pos, info in pos_stats[contig].iteritems():
115 print "%s,%i,%i,%i,%i" % (contig, pos, info.get("N", "NA"), info.get("-", "NA"), info.get("mut", "NA"))
116
117
118 def get_args():
119 args = argparse.ArgumentParser(description="Combine multiple VCFs into a single FASTA file.")
120
121 group = args.add_mutually_exclusive_group(required=True)
122 group.add_argument("--directory", "-d", help="Path to the directory with .vcf files.")
123 group.add_argument("--input", "-i", type=str, nargs='+', help="List of VCF files to process.")
124
125 args.add_argument("--out", "-o", required=True, help="Path to the output FASTA file.")
126
127 args.add_argument("--with-mixtures", type=float, help="Specify this option with a threshold to output mixtures above this threshold.")
128
129 args.add_argument("--column-Ns", type=float, help="Keeps columns with fraction of Ns above specified threshold.")
130
131 args.add_argument("--sample-Ns", type=float, help="Keeps samples with fraction of Ns above specified threshold.")
132
133 args.add_argument("--reference", type=str, help="If path to reference specified (FASTA), then whole genome will be written.")
134
135 group = args.add_mutually_exclusive_group()
136
137 group.add_argument("--include")
138 group.add_argument("--exclude")
139
140 args.add_argument("--with-stats", help="If a path is specified, then position of the outputed SNPs is stored in this file. Requires mumpy and matplotlib.")
141 args.add_argument("--plots-dir", default="plots", help="Where to write summary plots on SNPs extracted. Requires mumpy and matplotlib.")
142
143 args.add_argument("--debug", action="store_true", help="More verbose logging (default: turned off).")
144 args.add_argument("--local", action="store_true", help="Re-read the VCF instead of storing it in memory.")
145
146 return args.parse_args()
147
148 def main():
149 """
150 Process VCF files and merge them into a single fasta file.
151 """
152 args = get_args()
153
154 logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
155
156 contigs = list()
157
158 sample_stats = dict()
159
160 # All positions available for analysis.
161 avail_pos = dict()
162 # Stats about each position in each chromosome.
163 pos_stats = dict()
164 indel_summary = defaultdict(int)
165 # Cached version of the data.
166 vcf_data = dict()
167 mixtures = dict()
168
169 empty_tree = FastRBTree()
170
171 exclude = False
172 include = False
173
174 if args.reference:
175 ref_seq = OrderedDict()
176 with open(args.reference) as fp:
177 for record in SeqIO.parse(fp, "fasta"):
178 ref_seq[record.id] = str(record.seq)
179
180 args.reference = ref_seq
181
182 if args.exclude or args.include:
183 pos = {}
184 chr_pos = []
185 bed_file = args.include if args.include is not None else args.exclude
186
187 with open(bed_file) as fp:
188 for line in fp:
189 data = line.strip().split("\t")
190
191 chr_pos += [ (i, False,) for i in xrange(int(data[1]), int(data[2]) + 1)]
192
193 if data[0] not in pos:
194 pos[data[0]] = []
195
196 pos[data[0]] += chr_pos
197
198
199 pos = {chrom: FastRBTree(l) for chrom, l in pos.items()}
200
201 if args.include:
202 include = pos
203 else:
204 exclude = pos
205
206
207 if args.directory is not None and args.input is None:
208 args.input = glob.glob(os.path.join(args.directory, "*.filtered.vcf"))
209
210 # First pass to get the references and the positions to be analysed.
211 for vcf_in in args.input:
212 sample_name, _ = os.path.splitext(os.path.basename(vcf_in))
213 vcf_data[vcf_in] = list()
214 reader = vcf.Reader(filename=vcf_in)
215
216 for record in reader:
217 if include and include.get(record.CHROM, empty_tree).get(record.POS, True) or exclude and not exclude.get(record.CHROM, empty_tree).get(record.POS, True):
218 continue
219
220 if not args.local:
221 vcf_data[vcf_in].append(record)
222
223 if record.CHROM not in contigs:
224 contigs.append(record.CHROM)
225 avail_pos[record.CHROM] = FastRBTree()
226 mixtures[record.CHROM] = {}
227 sample_stats[record.CHROM] = {}
228
229 if sample_name not in mixtures[record.CHROM]:
230 mixtures[record.CHROM][sample_name] = FastRBTree()
231
232 if sample_name not in sample_stats[record.CHROM]:
233 sample_stats[record.CHROM][sample_name] = {}
234
235 if not record.FILTER:
236 if record.is_snp:
237 if record.POS in avail_pos[record.CHROM] and avail_pos[record.CHROM][record.POS] != record.REF:
238 logging.critical("SOMETHING IS REALLY WRONG because reference for the same position is DIFFERENT! %s in %s", record.POS, vcf_in)
239 return 2
240
241 if record.CHROM not in pos_stats:
242 pos_stats[record.CHROM] = {}
243
244 avail_pos[record.CHROM].insert(record.POS, str(record.REF))
245 pos_stats[record.CHROM][record.POS] = {"N":0, "-": 0, "mut": 0, "mix": 0, "gap": 0}
246
247 elif args.with_mixtures and record.is_snp:
248 mix = get_mixture(record, args.with_mixtures)
249
250 for ratio, code in mix.items():
251 for c in code:
252 avail_pos[record.CHROM].insert(record.POS, str(record.REF))
253 if record.CHROM not in pos_stats:
254 pos_stats[record.CHROM] = {}
255 pos_stats[record.CHROM][record.POS] = {"N": 0, "-": 0, "mut": 0, "mix": 0, "gap": 0}
256
257 if sample_name not in mixtures[record.CHROM]:
258 mixtures[record.CHROM][sample_name] = FastRBTree()
259
260 mixtures[record.CHROM][sample_name].insert(record.POS, c)
261 elif not record.is_deletion and not record.is_indel:
262 if record.CHROM not in pos_stats:
263 pos_stats[record.CHROM] = {}
264 pos_stats[record.CHROM][record.POS] = {"N": 0, "-": 0, "mut": 0, "mix": 0, "gap": 0}
265 avail_pos[record.CHROM].insert(record.POS, str(record.REF))
266 else:
267 logging.debug("Discarding %s from %s as DEL and/or INDEL", record.POS, vcf_in)
268 indel_summary[vcf_in] += 1
269 try:
270 vcf_data[vcf_in].remove(record)
271 except ValueError:
272 pass
273
274
275 all_data = { contig: {} for contig in contigs}
276 samples = []
277
278 for vcf_in in args.input:
279
280 sample_seq = ""
281 sample_name, _ = os.path.splitext(os.path.basename(vcf_in))
282 samples.append(sample_name)
283
284 # Initialise the data for this sample to be REF positions.
285 for contig in contigs:
286 all_data[contig][sample_name] = { pos: avail_pos[contig][pos] for pos in avail_pos[contig] }
287
288 # Re-read data from VCF if local is specified, otherwise get it from memory.
289 iterator = vcf.Reader(filename=vcf_in) if args.local else vcf_data[vcf_in]
290 for record in iterator:
291 # Array of filters that have been applied.
292 filters = []
293
294 # If position is our available position.
295 if avail_pos.get(record.CHROM, empty_tree).get(record.POS, False):
296 if not record.FILTER:
297 if record.is_snp:
298 if len(record.ALT) > 1:
299 logging.info("POS %s passed filters but has multiple alleles. Inserting N")
300 all_data[record.CHROM][sample_name][record.POS] = "N"
301 else:
302 all_data[record.CHROM][sample_name][record.POS] = record.ALT[0].sequence
303 pos_stats[record.CHROM][record.POS]["mut"] += 1
304 else:
305
306 # Currently we are only using first filter to call consensus.
307 extended_code = mixtures[record.CHROM][sample_name].get(record.POS, "N")
308
309 # extended_code = PHEFilterBase.call_concensus(record)
310
311 # Calculate the stats
312 if extended_code == "N":
313 pos_stats[record.CHROM][record.POS]["N"] += 1
314
315 if "n_pos" not in sample_stats[record.CHROM][sample_name]:
316 sample_stats[record.CHROM][sample_name]["n_pos"] = []
317 sample_stats[record.CHROM][sample_name]["n_pos"].append(record.POS)
318
319 elif extended_code == "-":
320 pos_stats[record.CHROM][record.POS]["-"] += 1
321 else:
322 pos_stats[record.CHROM][record.POS]["mix"] += 1
323 # print "Good mixture %s: %i (%s)" % (sample_name, record.POS, extended_code)
324 # Record if there was uncallable genoty/gap in the data.
325 if record.samples[0].data.GT == "./.":
326 pos_stats[record.CHROM][record.POS]["gap"] += 1
327
328 # Save the extended code of the SNP.
329 all_data[record.CHROM][sample_name][record.POS] = extended_code
330 del vcf_data[vcf_in]
331
332 # Output the data to the fasta file.
333 # The data is already aligned so simply output it.
334 discarded = {}
335
336 if args.reference:
337 # These should be in the same order as the order in reference.
338 contigs = args.reference.keys()
339
340 if args.sample_Ns:
341 delete_samples = []
342 for contig in contigs:
343 for sample in samples:
344
345 # Skip if the contig not in sample_stats
346 if contig not in sample_stats:
347 continue
348
349 sample_n_ratio = float(len(sample_stats[contig][sample]["n_pos"])) / len(avail_pos[contig])
350 if sample_n_ratio > args.sample_Ns:
351 for pos in sample_stats[contig][sample]["n_pos"]:
352 pos_stats[contig][pos]["N"] -= 1
353
354 logging.info("Removing %s due to high Ns in sample: %s", sample , sample_n_ratio)
355
356 delete_samples.append(sample)
357
358 samples = [sample for sample in samples if sample not in delete_samples]
359 snp_positions = []
360 with open(args.out, "w") as fp:
361
362 for sample in samples:
363 sample_seq = ""
364 for contig in contigs:
365 if contig in avail_pos:
366 if args.reference:
367 positions = xrange(1, len(args.reference[contig]) + 1)
368 else:
369 positions = avail_pos[contig].keys()
370 for pos in positions:
371 if pos in avail_pos[contig]:
372 if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \
373 float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns:
374 sample_seq += all_data[contig][sample][pos]
375 else:
376 if contig not in discarded:
377 discarded[contig] = []
378 discarded[contig].append(pos)
379 elif args.reference:
380 sample_seq += args.reference[contig][pos - 1]
381 elif args.reference:
382 sample_seq += args.reference[contig]
383
384 fp.write(">%s\n%s\n" % (sample, sample_seq))
385 # Do the same for reference data.
386 ref_snps = ""
387
388 for contig in contigs:
389 if contig in avail_pos:
390 if args.reference:
391 positions = xrange(1, len(args.reference[contig]) + 1)
392 else:
393 positions = avail_pos[contig].keys()
394 for pos in positions:
395 if pos in avail_pos[contig]:
396 if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \
397 float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns:
398
399 ref_snps += str(avail_pos[contig][pos])
400 snp_positions.append((contig, pos,))
401 elif args.reference:
402 ref_snps += args.reference[contig][pos - 1]
403 elif args.reference:
404 ref_snps += args.reference[contig]
405
406 fp.write(">reference\n%s\n" % ref_snps)
407
408 if can_stats and args.with_stats:
409 with open(args.with_stats, "wb") as fp:
410 fp.write("contig\tposition\tmutations\tn_frac\n")
411 for values in snp_positions:
412 fp.write("%s\t%s\t%s\t%s\n" % (values[0],
413 values[1],
414 float(pos_stats[values[0]][values[1]]["mut"]) / len(args.input),
415 float(pos_stats[values[0]][values[1]]["N"]) / len(args.input)))
416 plot_stats(pos_stats, len(samples), discarded=discarded, plots_dir=os.path.abspath(args.plots_dir))
417 # print_stats(sample_stats, pos_stats, total_vars=len(avail_pos[contig]))
418
419 total_discarded = 0
420 for _, i in discarded.items():
421 total_discarded += len(i)
422 logging.info("Discarded total of %i poor quality columns", float(total_discarded) / len(args.input))
423 logging.info("Samples with indels:")
424 for sample, count in indel_summary.iteritems():
425 logging.info("%s\t%s", sample, count)
426 return 0
427
428 if __name__ == '__main__':
429 import time
430
431 # with PyCallGraph(output=graphviz):
432 # T0 = time.time()
433 r = main()
434 # T1 = time.time()
435
436 # print "Time taken: %i" % (T1 - T0)
437 exit(r)