22
|
1 #!/usr/bin/env python
|
|
2 '''
|
|
3 Merge SNP data from multiple VCF files into a single fasta file.
|
|
4
|
|
5 Created on 5 Oct 2015
|
|
6
|
|
7 @author: alex
|
|
8 '''
|
|
9 import argparse
|
|
10 from collections import OrderedDict
|
|
11 from collections import defaultdict
|
|
12 import glob
|
|
13 import itertools
|
|
14 import logging
|
|
15 import os
|
|
16
|
|
17 from Bio import SeqIO
|
|
18 from bintrees import FastRBTree
|
|
19
|
|
20 # Try importing the matplotlib and numpy for stats.
|
|
21 try:
|
|
22 from matplotlib import pyplot as plt
|
|
23 import numpy
|
|
24 can_stats = True
|
|
25 except ImportError:
|
|
26 can_stats = False
|
|
27
|
|
28 import vcf
|
|
29
|
|
30 from phe.variant_filters import IUPAC_CODES
|
|
31
|
|
32
|
|
33 def plot_stats(pos_stats, total_samples, plots_dir="plots", discarded={}):
|
|
34 if not os.path.exists(plots_dir):
|
|
35 os.makedirs(plots_dir)
|
|
36
|
|
37 for contig in pos_stats:
|
|
38 plt.style.use('ggplot')
|
|
39
|
|
40 x = numpy.array([pos for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])
|
|
41 y = numpy.array([ float(pos_stats[contig][pos]["mut"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, []) ])
|
|
42
|
|
43 f, (ax1, ax2, ax3, ax4) = plt.subplots(4, sharex=True, sharey=True)
|
|
44 f.set_size_inches(12, 15)
|
|
45 ax1.plot(x, y, 'ro')
|
|
46 ax1.set_title("Fraction of samples with SNPs")
|
|
47 plt.ylim(0, 1.1)
|
|
48
|
|
49 y = numpy.array([ float(pos_stats[contig][pos]["N"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])
|
|
50 ax2.plot(x, y, 'bo')
|
|
51 ax2.set_title("Fraction of samples with Ns")
|
|
52
|
|
53 y = numpy.array([ float(pos_stats[contig][pos]["mix"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])
|
|
54 ax3.plot(x, y, 'go')
|
|
55 ax3.set_title("Fraction of samples with mixed bases")
|
|
56
|
|
57 y = numpy.array([ float(pos_stats[contig][pos]["gap"]) / total_samples for pos in pos_stats[contig] if pos not in discarded.get(contig, [])])
|
|
58 ax4.plot(x, y, 'yo')
|
|
59 ax4.set_title("Fraction of samples with uncallable genotype (gap)")
|
|
60
|
|
61 contig = contig.replace("/", "-")
|
|
62 plt.savefig(os.path.join(plots_dir, "%s.png" % contig), dpi=100)
|
|
63
|
|
64 def get_mixture(record, threshold):
|
|
65 mixtures = {}
|
|
66 try:
|
|
67 if len(record.samples[0].data.AD) > 1:
|
|
68
|
|
69 total_depth = sum(record.samples[0].data.AD)
|
|
70 # Go over all combinations of touples.
|
|
71 for comb in itertools.combinations(range(0, len(record.samples[0].data.AD)), 2):
|
|
72 i = comb[0]
|
|
73 j = comb[1]
|
|
74
|
|
75 alleles = list()
|
|
76
|
|
77 if 0 in comb:
|
|
78 alleles.append(str(record.REF))
|
|
79
|
|
80 if i != 0:
|
|
81 alleles.append(str(record.ALT[i - 1]))
|
|
82 mixture = record.samples[0].data.AD[i]
|
|
83 if j != 0:
|
|
84 alleles.append(str(record.ALT[j - 1]))
|
|
85 mixture = record.samples[0].data.AD[j]
|
|
86
|
|
87 ratio = float(mixture) / total_depth
|
|
88 if ratio == 1.0:
|
|
89 logging.debug("This is only designed for mixtures! %s %s %s %s", record, ratio, record.samples[0].data.AD, record.FILTER)
|
|
90
|
|
91 if ratio not in mixtures:
|
|
92 mixtures[ratio] = []
|
|
93 mixtures[ratio].append(alleles.pop())
|
|
94
|
|
95 elif ratio >= threshold:
|
|
96 try:
|
|
97 code = IUPAC_CODES[frozenset(alleles)]
|
|
98 if ratio not in mixtures:
|
|
99 mixtures[ratio] = []
|
|
100 mixtures[ratio].append(code)
|
|
101 except KeyError:
|
|
102 logging.warn("Could not retrieve IUPAC code for %s from %s", alleles, record)
|
|
103 except AttributeError:
|
|
104 mixtures = {}
|
|
105
|
|
106 return mixtures
|
|
107
|
|
108 def print_stats(stats, pos_stats, total_vars):
|
|
109 for contig in stats:
|
|
110 for sample, info in stats[contig].items():
|
|
111 print "%s,%i,%i" % (sample, len(info.get("n_pos", [])), total_vars)
|
|
112
|
|
113 for contig in stats:
|
|
114 for pos, info in pos_stats[contig].iteritems():
|
|
115 print "%s,%i,%i,%i,%i" % (contig, pos, info.get("N", "NA"), info.get("-", "NA"), info.get("mut", "NA"))
|
|
116
|
|
117
|
|
118 def get_args():
|
|
119 args = argparse.ArgumentParser(description="Combine multiple VCFs into a single FASTA file.")
|
|
120
|
|
121 group = args.add_mutually_exclusive_group(required=True)
|
|
122 group.add_argument("--directory", "-d", help="Path to the directory with .vcf files.")
|
|
123 group.add_argument("--input", "-i", type=str, nargs='+', help="List of VCF files to process.")
|
|
124
|
|
125 args.add_argument("--out", "-o", required=True, help="Path to the output FASTA file.")
|
|
126
|
|
127 args.add_argument("--with-mixtures", type=float, help="Specify this option with a threshold to output mixtures above this threshold.")
|
|
128
|
|
129 args.add_argument("--column-Ns", type=float, help="Keeps columns with fraction of Ns above specified threshold.")
|
|
130
|
|
131 args.add_argument("--sample-Ns", type=float, help="Keeps samples with fraction of Ns above specified threshold.")
|
|
132
|
|
133 args.add_argument("--reference", type=str, help="If path to reference specified (FASTA), then whole genome will be written.")
|
|
134
|
|
135 group = args.add_mutually_exclusive_group()
|
|
136
|
|
137 group.add_argument("--include")
|
|
138 group.add_argument("--exclude")
|
|
139
|
|
140 args.add_argument("--with-stats", help="If a path is specified, then position of the outputed SNPs is stored in this file. Requires mumpy and matplotlib.")
|
|
141 args.add_argument("--plots-dir", default="plots", help="Where to write summary plots on SNPs extracted. Requires mumpy and matplotlib.")
|
|
142
|
|
143 args.add_argument("--debug", action="store_true", help="More verbose logging (default: turned off).")
|
|
144 args.add_argument("--local", action="store_true", help="Re-read the VCF instead of storing it in memory.")
|
|
145
|
|
146 return args.parse_args()
|
|
147
|
|
148 def main():
|
|
149 """
|
|
150 Process VCF files and merge them into a single fasta file.
|
|
151 """
|
|
152 args = get_args()
|
|
153
|
|
154 logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
|
|
155
|
|
156 contigs = list()
|
|
157
|
|
158 sample_stats = dict()
|
|
159
|
|
160 # All positions available for analysis.
|
|
161 avail_pos = dict()
|
|
162 # Stats about each position in each chromosome.
|
|
163 pos_stats = dict()
|
|
164 indel_summary = defaultdict(int)
|
|
165 # Cached version of the data.
|
|
166 vcf_data = dict()
|
|
167 mixtures = dict()
|
|
168
|
|
169 empty_tree = FastRBTree()
|
|
170
|
|
171 exclude = False
|
|
172 include = False
|
|
173
|
|
174 if args.reference:
|
|
175 ref_seq = OrderedDict()
|
|
176 with open(args.reference) as fp:
|
|
177 for record in SeqIO.parse(fp, "fasta"):
|
|
178 ref_seq[record.id] = str(record.seq)
|
|
179
|
|
180 args.reference = ref_seq
|
|
181
|
|
182 if args.exclude or args.include:
|
|
183 pos = {}
|
|
184 chr_pos = []
|
|
185 bed_file = args.include if args.include is not None else args.exclude
|
|
186
|
|
187 with open(bed_file) as fp:
|
|
188 for line in fp:
|
|
189 data = line.strip().split("\t")
|
|
190
|
|
191 chr_pos += [ (i, False,) for i in xrange(int(data[1]), int(data[2]) + 1)]
|
|
192
|
|
193 if data[0] not in pos:
|
|
194 pos[data[0]] = []
|
|
195
|
|
196 pos[data[0]] += chr_pos
|
|
197
|
|
198
|
|
199 pos = {chrom: FastRBTree(l) for chrom, l in pos.items()}
|
|
200
|
|
201 if args.include:
|
|
202 include = pos
|
|
203 else:
|
|
204 exclude = pos
|
|
205
|
|
206
|
|
207 if args.directory is not None and args.input is None:
|
|
208 args.input = glob.glob(os.path.join(args.directory, "*.filtered.vcf"))
|
|
209
|
|
210 # First pass to get the references and the positions to be analysed.
|
|
211 for vcf_in in args.input:
|
|
212 sample_name, _ = os.path.splitext(os.path.basename(vcf_in))
|
|
213 vcf_data[vcf_in] = list()
|
|
214 reader = vcf.Reader(filename=vcf_in)
|
|
215
|
|
216 for record in reader:
|
|
217 if include and include.get(record.CHROM, empty_tree).get(record.POS, True) or exclude and not exclude.get(record.CHROM, empty_tree).get(record.POS, True):
|
|
218 continue
|
|
219
|
|
220 if not args.local:
|
|
221 vcf_data[vcf_in].append(record)
|
|
222
|
|
223 if record.CHROM not in contigs:
|
|
224 contigs.append(record.CHROM)
|
|
225 avail_pos[record.CHROM] = FastRBTree()
|
|
226 mixtures[record.CHROM] = {}
|
|
227 sample_stats[record.CHROM] = {}
|
|
228
|
|
229 if sample_name not in mixtures[record.CHROM]:
|
|
230 mixtures[record.CHROM][sample_name] = FastRBTree()
|
|
231
|
|
232 if sample_name not in sample_stats[record.CHROM]:
|
|
233 sample_stats[record.CHROM][sample_name] = {}
|
|
234
|
|
235 if not record.FILTER:
|
|
236 if record.is_snp:
|
|
237 if record.POS in avail_pos[record.CHROM] and avail_pos[record.CHROM][record.POS] != record.REF:
|
|
238 logging.critical("SOMETHING IS REALLY WRONG because reference for the same position is DIFFERENT! %s in %s", record.POS, vcf_in)
|
|
239 return 2
|
|
240
|
|
241 if record.CHROM not in pos_stats:
|
|
242 pos_stats[record.CHROM] = {}
|
|
243
|
|
244 avail_pos[record.CHROM].insert(record.POS, str(record.REF))
|
|
245 pos_stats[record.CHROM][record.POS] = {"N":0, "-": 0, "mut": 0, "mix": 0, "gap": 0}
|
|
246
|
|
247 elif args.with_mixtures and record.is_snp:
|
|
248 mix = get_mixture(record, args.with_mixtures)
|
|
249
|
|
250 for ratio, code in mix.items():
|
|
251 for c in code:
|
|
252 avail_pos[record.CHROM].insert(record.POS, str(record.REF))
|
|
253 if record.CHROM not in pos_stats:
|
|
254 pos_stats[record.CHROM] = {}
|
|
255 pos_stats[record.CHROM][record.POS] = {"N": 0, "-": 0, "mut": 0, "mix": 0, "gap": 0}
|
|
256
|
|
257 if sample_name not in mixtures[record.CHROM]:
|
|
258 mixtures[record.CHROM][sample_name] = FastRBTree()
|
|
259
|
|
260 mixtures[record.CHROM][sample_name].insert(record.POS, c)
|
|
261 elif not record.is_deletion and not record.is_indel:
|
|
262 if record.CHROM not in pos_stats:
|
|
263 pos_stats[record.CHROM] = {}
|
|
264 pos_stats[record.CHROM][record.POS] = {"N": 0, "-": 0, "mut": 0, "mix": 0, "gap": 0}
|
|
265 avail_pos[record.CHROM].insert(record.POS, str(record.REF))
|
|
266 else:
|
|
267 logging.debug("Discarding %s from %s as DEL and/or INDEL", record.POS, vcf_in)
|
|
268 indel_summary[vcf_in] += 1
|
|
269 try:
|
|
270 vcf_data[vcf_in].remove(record)
|
|
271 except ValueError:
|
|
272 pass
|
|
273
|
|
274
|
|
275 all_data = { contig: {} for contig in contigs}
|
|
276 samples = []
|
|
277
|
|
278 for vcf_in in args.input:
|
|
279
|
|
280 sample_seq = ""
|
|
281 sample_name, _ = os.path.splitext(os.path.basename(vcf_in))
|
|
282 samples.append(sample_name)
|
|
283
|
|
284 # Initialise the data for this sample to be REF positions.
|
|
285 for contig in contigs:
|
|
286 all_data[contig][sample_name] = { pos: avail_pos[contig][pos] for pos in avail_pos[contig] }
|
|
287
|
|
288 # Re-read data from VCF if local is specified, otherwise get it from memory.
|
|
289 iterator = vcf.Reader(filename=vcf_in) if args.local else vcf_data[vcf_in]
|
|
290 for record in iterator:
|
|
291 # Array of filters that have been applied.
|
|
292 filters = []
|
|
293
|
|
294 # If position is our available position.
|
|
295 if avail_pos.get(record.CHROM, empty_tree).get(record.POS, False):
|
|
296 if not record.FILTER:
|
|
297 if record.is_snp:
|
|
298 if len(record.ALT) > 1:
|
|
299 logging.info("POS %s passed filters but has multiple alleles. Inserting N")
|
|
300 all_data[record.CHROM][sample_name][record.POS] = "N"
|
|
301 else:
|
|
302 all_data[record.CHROM][sample_name][record.POS] = record.ALT[0].sequence
|
|
303 pos_stats[record.CHROM][record.POS]["mut"] += 1
|
|
304 else:
|
|
305
|
|
306 # Currently we are only using first filter to call consensus.
|
|
307 extended_code = mixtures[record.CHROM][sample_name].get(record.POS, "N")
|
|
308
|
|
309 # extended_code = PHEFilterBase.call_concensus(record)
|
|
310
|
|
311 # Calculate the stats
|
|
312 if extended_code == "N":
|
|
313 pos_stats[record.CHROM][record.POS]["N"] += 1
|
|
314
|
|
315 if "n_pos" not in sample_stats[record.CHROM][sample_name]:
|
|
316 sample_stats[record.CHROM][sample_name]["n_pos"] = []
|
|
317 sample_stats[record.CHROM][sample_name]["n_pos"].append(record.POS)
|
|
318
|
|
319 elif extended_code == "-":
|
|
320 pos_stats[record.CHROM][record.POS]["-"] += 1
|
|
321 else:
|
|
322 pos_stats[record.CHROM][record.POS]["mix"] += 1
|
|
323 # print "Good mixture %s: %i (%s)" % (sample_name, record.POS, extended_code)
|
|
324 # Record if there was uncallable genoty/gap in the data.
|
|
325 if record.samples[0].data.GT == "./.":
|
|
326 pos_stats[record.CHROM][record.POS]["gap"] += 1
|
|
327
|
|
328 # Save the extended code of the SNP.
|
|
329 all_data[record.CHROM][sample_name][record.POS] = extended_code
|
|
330 del vcf_data[vcf_in]
|
|
331
|
|
332 # Output the data to the fasta file.
|
|
333 # The data is already aligned so simply output it.
|
|
334 discarded = {}
|
|
335
|
|
336 if args.reference:
|
|
337 # These should be in the same order as the order in reference.
|
|
338 contigs = args.reference.keys()
|
|
339
|
|
340 if args.sample_Ns:
|
|
341 delete_samples = []
|
|
342 for contig in contigs:
|
|
343 for sample in samples:
|
|
344
|
|
345 # Skip if the contig not in sample_stats
|
|
346 if contig not in sample_stats:
|
|
347 continue
|
|
348
|
|
349 sample_n_ratio = float(len(sample_stats[contig][sample]["n_pos"])) / len(avail_pos[contig])
|
|
350 if sample_n_ratio > args.sample_Ns:
|
|
351 for pos in sample_stats[contig][sample]["n_pos"]:
|
|
352 pos_stats[contig][pos]["N"] -= 1
|
|
353
|
|
354 logging.info("Removing %s due to high Ns in sample: %s", sample , sample_n_ratio)
|
|
355
|
|
356 delete_samples.append(sample)
|
|
357
|
|
358 samples = [sample for sample in samples if sample not in delete_samples]
|
|
359 snp_positions = []
|
|
360 with open(args.out, "w") as fp:
|
|
361
|
|
362 for sample in samples:
|
|
363 sample_seq = ""
|
|
364 for contig in contigs:
|
|
365 if contig in avail_pos:
|
|
366 if args.reference:
|
|
367 positions = xrange(1, len(args.reference[contig]) + 1)
|
|
368 else:
|
|
369 positions = avail_pos[contig].keys()
|
|
370 for pos in positions:
|
|
371 if pos in avail_pos[contig]:
|
|
372 if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \
|
|
373 float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns:
|
|
374 sample_seq += all_data[contig][sample][pos]
|
|
375 else:
|
|
376 if contig not in discarded:
|
|
377 discarded[contig] = []
|
|
378 discarded[contig].append(pos)
|
|
379 elif args.reference:
|
|
380 sample_seq += args.reference[contig][pos - 1]
|
|
381 elif args.reference:
|
|
382 sample_seq += args.reference[contig]
|
|
383
|
|
384 fp.write(">%s\n%s\n" % (sample, sample_seq))
|
|
385 # Do the same for reference data.
|
|
386 ref_snps = ""
|
|
387
|
|
388 for contig in contigs:
|
|
389 if contig in avail_pos:
|
|
390 if args.reference:
|
|
391 positions = xrange(1, len(args.reference[contig]) + 1)
|
|
392 else:
|
|
393 positions = avail_pos[contig].keys()
|
|
394 for pos in positions:
|
|
395 if pos in avail_pos[contig]:
|
|
396 if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \
|
|
397 float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns:
|
|
398
|
|
399 ref_snps += str(avail_pos[contig][pos])
|
|
400 snp_positions.append((contig, pos,))
|
|
401 elif args.reference:
|
|
402 ref_snps += args.reference[contig][pos - 1]
|
|
403 elif args.reference:
|
|
404 ref_snps += args.reference[contig]
|
|
405
|
|
406 fp.write(">reference\n%s\n" % ref_snps)
|
|
407
|
|
408 if can_stats and args.with_stats:
|
|
409 with open(args.with_stats, "wb") as fp:
|
|
410 fp.write("contig\tposition\tmutations\tn_frac\n")
|
|
411 for values in snp_positions:
|
|
412 fp.write("%s\t%s\t%s\t%s\n" % (values[0],
|
|
413 values[1],
|
|
414 float(pos_stats[values[0]][values[1]]["mut"]) / len(args.input),
|
|
415 float(pos_stats[values[0]][values[1]]["N"]) / len(args.input)))
|
|
416 plot_stats(pos_stats, len(samples), discarded=discarded, plots_dir=os.path.abspath(args.plots_dir))
|
|
417 # print_stats(sample_stats, pos_stats, total_vars=len(avail_pos[contig]))
|
|
418
|
|
419 total_discarded = 0
|
|
420 for _, i in discarded.items():
|
|
421 total_discarded += len(i)
|
|
422 logging.info("Discarded total of %i poor quality columns", float(total_discarded) / len(args.input))
|
|
423 logging.info("Samples with indels:")
|
|
424 for sample, count in indel_summary.iteritems():
|
|
425 logging.info("%s\t%s", sample, count)
|
|
426 return 0
|
|
427
|
|
428 if __name__ == '__main__':
|
|
429 import time
|
|
430
|
|
431 # with PyCallGraph(output=graphviz):
|
|
432 # T0 = time.time()
|
|
433 r = main()
|
|
434 # T1 = time.time()
|
|
435
|
|
436 # print "Time taken: %i" % (T1 - T0)
|
|
437 exit(r)
|