annotate vcfs2fasta.py @ 16:1d0bc21232ec draft

Uploaded
author ulfschaefer
date Wed, 16 Dec 2015 07:32:22 -0500
parents f72039c5faa4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
14
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
1 #!/usr/bin/env python
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
2 '''
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
3 Merge SNP data from multiple VCF files into a single fasta file.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
4
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
5 Created on 5 Oct 2015
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
6
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
7 @author: alex
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
8 '''
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
9 import argparse
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
10 from collections import OrderedDict
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
11 import glob
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
12 import itertools
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
13 import logging
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
14 import os
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
15
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
16 from Bio import SeqIO
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
17 from bintrees import FastRBTree
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
18
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
19 # Try importing the matplotlib and numpy for stats.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
20 try:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
21 from matplotlib import pyplot as plt
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
22 import numpy
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
23 can_stats = True
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
24 except ImportError:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
25 can_stats = False
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
26
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
27 import vcf
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
28
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
29 from phe.variant_filters import IUPAC_CODES
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
30
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
31
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
def plot_stats(pos_stats, total_samples, plots_dir="plots", discarded=None):
    """
    Write one summary PNG per contig showing per-position sample fractions.

    Each figure has four stacked panels sharing both axes: the fraction of
    samples with a SNP ("mut"), with an N ("N"), with a mixed base ("mix")
    and with an uncallable genotype ("gap") at every retained position.
    The figure is saved as ``<plots_dir>/<contig>.png``.

    Parameters
    ----------
    pos_stats: dict
        Maps contig -> position -> counters. The counters read here are
        "mut", "N", "mix" and "gap".
    total_samples: int
        Divisor used to turn the counters into fractions.
    plots_dir: str, optional
        Output directory, created if it does not exist (default: "plots").
    discarded: dict, optional
        Maps contig -> collection of positions to leave out of the plots.
        Defaults to nothing discarded.
    """
    # Avoid a shared mutable default argument.
    if discarded is None:
        discarded = {}

    if not os.path.exists(plots_dir):
        os.makedirs(plots_dir)

    # Nothing to draw; leave the global matplotlib style untouched.
    if not pos_stats:
        return

    plt.style.use('ggplot')

    # (counter key, matplotlib format string, panel title), top to bottom.
    panels = (("mut", 'ro', "Fraction of samples with SNPs"),
              ("N", 'bo', "Fraction of samples with Ns"),
              ("mix", 'go', "Fraction of samples with mixed bases"),
              ("gap", 'yo', "Fraction of samples with uncallable genotype (gap)"))

    for contig in pos_stats:
        contig_stats = pos_stats[contig]

        # Build the lookup once per contig instead of scanning a list for
        # every position in every panel.
        skipped = set(discarded.get(contig, ()))
        kept = [pos for pos in contig_stats if pos not in skipped]

        x = numpy.array(kept)

        fig, axes = plt.subplots(4, sharex=True, sharey=True)
        fig.set_size_inches(12, 15)

        for ax, (key, fmt, title) in zip(axes, panels):
            y = numpy.array([float(contig_stats[pos][key]) / total_samples for pos in kept])
            ax.plot(x, y, fmt)
            ax.set_title(title)

        # The y axis is shared, so this fixes the range of all four panels.
        axes[0].set_ylim(0, 1.1)

        fig.savefig(os.path.join(plots_dir, "%s.png" % contig), dpi=100)

        # Release the figure; otherwise one figure per contig stays alive.
        plt.close(fig)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
62
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
def get_mixture(record, threshold):
    """
    Collect candidate mixed-base calls for the first sample of a VCF record.

    Every pair of allele indices (0 is REF, k is ALT[k - 1]) is examined.
    The pair's ratio is the AD depth of the higher-indexed allele divided by
    the summed AD depth. A ratio of exactly 1.0 is not a mixture, so the
    plain allele is recorded; a ratio at or above *threshold* records the
    IUPAC code for the allele pair.

    Parameters
    ----------
    record:
        Record exposing REF, ALT, FILTER and samples[0].data.AD.
    threshold: float
        Minimum ratio for a pair to be reported as a mixture.

    Returns
    -------
    dict
        Maps ratio -> list of bases/IUPAC codes observed at that ratio.
        Empty when the AD field is absent, missing or not a sequence, when
        fewer than two depths are present, or when the total depth is zero.
    """
    mixtures = {}

    try:
        depths = record.samples[0].data.AD
        n_alleles = len(depths)
    except (AttributeError, TypeError):
        # No AD field on the call, or AD is missing (None) or a scalar.
        return mixtures

    if n_alleles < 2:
        return mixtures

    total_depth = sum(depths)
    if not total_depth:
        # No reads support any allele: no ratio can be computed.
        return mixtures

    # Go over all combinations of tuples. combinations() yields i < j, so j
    # is never the reference and only i can be.
    for i, j in itertools.combinations(range(n_alleles), 2):
        first = str(record.REF) if i == 0 else str(record.ALT[i - 1])
        second = str(record.ALT[j - 1])
        alleles = [first, second]

        # The ratio is driven by the depth of the higher-indexed allele.
        ratio = float(depths[j]) / total_depth

        if ratio == 1.0:
            logging.debug("This is only designed for mixtures! %s %s %s %s", record, ratio, depths, record.FILTER)
            mixtures.setdefault(ratio, []).append(second)
        elif ratio >= threshold:
            try:
                code = IUPAC_CODES[frozenset(alleles)]
            except KeyError:
                logging.warning("Could not retrieve IUPAC code for %s from %s", alleles, record)
            else:
                mixtures.setdefault(ratio, []).append(code)

    return mixtures
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
106
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
def print_stats(stats, pos_stats, total_vars):
    """
    Print per-sample and per-position summary lines as CSV to stdout.

    First, for every contig in *stats*, one ``sample,n_count,total_vars``
    line per sample, where n_count is the length of the sample's "n_pos"
    list (0 if absent). Then, for every contig in *stats*, one
    ``contig,pos,N,-,mut`` line per position found in *pos_stats* for that
    contig; a counter that is absent is printed as ``NA``.

    Parameters
    ----------
    stats: dict
        Maps contig -> sample name -> dict that may hold an "n_pos" list.
    pos_stats: dict
        Maps contig -> position -> counters ("N", "-", "mut" are read).
    total_vars: int
        Value printed in the last column of every sample line.
    """
    for contig in stats:
        for sample, info in stats[contig].items():
            print("%s,%i,%i" % (sample, len(info.get("n_pos", [])), total_vars))

    for contig in stats:
        # A contig may have sample stats but no position stats at all.
        for pos, info in pos_stats.get(contig, {}).items():
            # %s rather than %i so that the "NA" placeholder can be formatted.
            print("%s,%i,%s,%s,%s" % (contig, pos, info.get("N", "NA"), info.get("-", "NA"), info.get("mut", "NA")))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
115
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
116
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
def get_args(argv=None):
    """
    Build the command line parser and parse the arguments.

    Exactly one of --directory / --input is required, --out is required and
    --include / --exclude are mutually exclusive.

    Parameters
    ----------
    argv: list of str, optional
        Arguments to parse. When None (the default) argparse falls back to
        the process command line.

    Returns
    -------
    argparse.Namespace
        The parsed arguments.
    """
    parser = argparse.ArgumentParser(description="Combine multiple VCFs into a single FASTA file.")

    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--directory", "-d", help="Path to the directory with .vcf files.")
    source.add_argument("--input", "-i", type=str, nargs='+', help="List of VCF files to process.")

    parser.add_argument("--out", "-o", required=True, help="Path to the output FASTA file.")

    parser.add_argument("--with-mixtures", type=float, help="Specify this option with a threshold to output mixtures above this threshold.")

    # Columns/samples are retained while their fraction of Ns stays below the threshold.
    parser.add_argument("--column-Ns", type=float, help="Keeps columns with fraction of Ns below specified threshold.")

    parser.add_argument("--sample-Ns", type=float, help="Keeps samples with fraction of Ns below specified threshold.")

    parser.add_argument("--reference", type=str, help="If path to reference specified (FASTA), then whole genome will be written.")

    regions = parser.add_mutually_exclusive_group()

    regions.add_argument("--include", help="Tab-separated file of regions (chromosome, start, end); only positions inside these regions are used.")
    regions.add_argument("--exclude", help="Tab-separated file of regions (chromosome, start, end); positions inside these regions are ignored.")

    parser.add_argument("--with-stats", help="If a path is specified, then position of the outputted SNPs is stored in this file. Requires numpy and matplotlib.")
    parser.add_argument("--plots-dir", default="plots", help="Where to write summary plots on SNPs extracted. Requires numpy and matplotlib.")

    return parser.parse_args(argv)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
143
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
144 def main():
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
145 """
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
146 Process VCF files and merge them into a single fasta file.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
147 """
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
148
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
149 logging.basicConfig(level=logging.INFO)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
150
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
151 args = get_args()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
152 contigs = list()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
153
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
154 sample_stats = dict()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
155
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
156 # All positions available for analysis.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
157 avail_pos = dict()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
158 # Stats about each position in each chromosome.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
159 pos_stats = dict()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
160 # Cached version of the data.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
161 vcf_data = dict()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
162 mixtures = dict()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
163
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
164 empty_tree = FastRBTree()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
165
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
166 exclude = False
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
167 include = False
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
168
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
169 if args.reference:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
170 ref_seq = OrderedDict()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
171 with open(args.reference) as fp:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
172 for record in SeqIO.parse(fp, "fasta"):
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
173 ref_seq[record.id] = str(record.seq)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
174
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
175 args.reference = ref_seq
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
176
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
177 if args.exclude or args.include:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
178 pos = {}
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
179 chr_pos = []
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
180 bed_file = args.include if args.include is not None else args.exclude
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
181
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
182 with open(bed_file) as fp:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
183 for line in fp:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
184 data = line.strip().split("\t")
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
185
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
186 chr_pos += [ (i, False,) for i in xrange(int(data[1]), int(data[2]) + 1)]
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
187
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
188 if data[0] not in pos:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
189 pos[data[0]] = []
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
190
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
191 pos[data[0]] += chr_pos
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
192
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
193
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
194 pos = {chrom: FastRBTree(l) for chrom, l in pos.items()}
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
195
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
196 if args.include:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
197 include = pos
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
198 else:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
199 exclude = pos
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
200
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
201
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
202 if args.directory is not None and args.input is None:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
203 args.input = glob.glob(os.path.join(args.directory, "*.vcf"))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
204
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
205 # First pass to get the references and the positions to be analysed.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
206 for vcf_in in args.input:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
207 sample_name, _ = os.path.splitext(os.path.basename(vcf_in))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
208 vcf_data[vcf_in] = list()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
209 reader = vcf.Reader(filename=vcf_in)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
210
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
211 for record in reader:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
212 if include and include.get(record.CHROM, empty_tree).get(record.POS, True) or exclude and not exclude.get(record.CHROM, empty_tree).get(record.POS, True):
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
213 continue
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
214
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
215 vcf_data[vcf_in].append(record)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
216
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
217 if record.CHROM not in contigs:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
218 contigs.append(record.CHROM)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
219 avail_pos[record.CHROM] = FastRBTree()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
220 mixtures[record.CHROM] = {}
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
221 sample_stats[record.CHROM] = {}
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
222
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
223 if sample_name not in mixtures[record.CHROM]:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
224 mixtures[record.CHROM][sample_name] = FastRBTree()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
225
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
226 if sample_name not in sample_stats[record.CHROM]:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
227 sample_stats[record.CHROM][sample_name] = {}
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
228
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
229 if not record.FILTER:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
230 if record.is_snp:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
231 if record.POS in avail_pos[record.CHROM] and avail_pos[record.CHROM][record.POS] != record.REF:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
232 logging.critical("SOMETHING IS REALLY WRONG because reference for the same position is DIFFERENT! %s", record.POS)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
233 return 2
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
234
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
235 if record.CHROM not in pos_stats:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
236 pos_stats[record.CHROM] = {}
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
237
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
238 avail_pos[record.CHROM].insert(record.POS, str(record.REF))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
239 pos_stats[record.CHROM][record.POS] = {"N":0, "-": 0, "mut": 0, "mix": 0, "gap": 0}
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
240
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
241 elif args.with_mixtures and record.is_snp:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
242 mix = get_mixture(record, args.with_mixtures)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
243
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
244 for ratio, code in mix.items():
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
245 for c in code:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
246 avail_pos[record.CHROM].insert(record.POS, str(record.REF))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
247 if record.CHROM not in pos_stats:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
248 pos_stats[record.CHROM] = {}
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
249 pos_stats[record.CHROM][record.POS] = {"N": 0, "-": 0, "mut": 0, "mix": 0, "gap": 0}
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
250
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
251 if sample_name not in mixtures[record.CHROM]:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
252 mixtures[record.CHROM][sample_name] = FastRBTree()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
253
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
254 mixtures[record.CHROM][sample_name].insert(record.POS, c)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
255
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
256
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
257 all_data = { contig: {} for contig in contigs}
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
258 samples = []
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
259
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
260 for vcf_in in args.input:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
261
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
262 sample_seq = ""
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
263 sample_name, _ = os.path.splitext(os.path.basename(vcf_in))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
264 samples.append(sample_name)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
265
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
266 # Initialise the data for this sample to be REF positions.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
267 for contig in contigs:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
268 all_data[contig][sample_name] = { pos: avail_pos[contig][pos] for pos in avail_pos[contig] }
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
269
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
270 # reader = vcf.Reader(filename=vcf_in)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
271 for record in vcf_data[vcf_in]:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
272 # Array of filters that have been applied.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
273 filters = []
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
274
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
275 # If position is our available position.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
276 if avail_pos.get(record.CHROM, empty_tree).get(record.POS, False):
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
277 if record.FILTER == "PASS" or not record.FILTER:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
278 if record.is_snp:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
279 if len(record.ALT) > 1:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
280 logging.info("POS %s passed filters but has multiple alleles. Inserting N")
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
281 all_data[record.CHROM][sample_name][record.POS] = "N"
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
282 else:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
283 all_data[record.CHROM][sample_name][record.POS] = record.ALT[0].sequence
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
284 pos_stats[record.CHROM][record.POS]["mut"] += 1
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
285 else:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
286
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
287 # Currently we are only using first filter to call consensus.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
288 extended_code = mixtures[record.CHROM][sample_name].get(record.POS, "N")
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
289
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
290 # extended_code = PHEFilterBase.call_concensus(record)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
291
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
292 # Calculate the stats
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
293 if extended_code == "N":
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
294 pos_stats[record.CHROM][record.POS]["N"] += 1
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
295
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
296 if "n_pos" not in sample_stats[record.CHROM][sample_name]:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
297 sample_stats[record.CHROM][sample_name]["n_pos"] = []
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
298 sample_stats[record.CHROM][sample_name]["n_pos"].append(record.POS)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
299
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
300 elif extended_code == "-":
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
301 pos_stats[record.CHROM][record.POS]["-"] += 1
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
302 else:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
303 pos_stats[record.CHROM][record.POS]["mix"] += 1
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
304 # print "Good mixture %s: %i (%s)" % (sample_name, record.POS, extended_code)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
305 # Record if there was uncallable genoty/gap in the data.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
306 if record.samples[0].data.GT == "./.":
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
307 pos_stats[record.CHROM][record.POS]["gap"] += 1
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
308
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
309 # Save the extended code of the SNP.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
310 all_data[record.CHROM][sample_name][record.POS] = extended_code
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
311 del vcf_data[vcf_in]
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
312
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
313 # Output the data to the fasta file.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
314 # The data is already aligned so simply output it.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
315 discarded = {}
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
316
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
317 if args.reference:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
318 # These should be in the same order as the order in reference.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
319 contigs = args.reference.keys()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
320
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
321 if args.sample_Ns:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
322 delete_samples = []
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
323 for contig in contigs:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
324 for sample in samples:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
325
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
326 # Skip if the contig not in sample_stats
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
327 if contig not in sample_stats:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
328 continue
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
329
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
330 sample_n_ratio = float(len(sample_stats[contig][sample]["n_pos"])) / len(avail_pos[contig])
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
331 if sample_n_ratio > args.sample_Ns:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
332 for pos in sample_stats[contig][sample]["n_pos"]:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
333 pos_stats[contig][pos]["N"] -= 1
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
334
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
335 logging.info("Removing %s due to high Ns in sample: %s", sample , sample_n_ratio)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
336
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
337 delete_samples.append(sample)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
338
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
339 samples = [sample for sample in samples if sample not in delete_samples]
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
340 snp_positions = []
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
341 with open(args.out, "w") as fp:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
342
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
343 for sample in samples:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
344 sample_seq = ""
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
345 for contig in contigs:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
346 if contig in avail_pos:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
347 if args.reference:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
348 positions = xrange(1, len(args.reference[contig]) + 1)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
349 else:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
350 positions = avail_pos[contig].keys()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
351 for pos in positions:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
352 if pos in avail_pos[contig]:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
353 if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
354 float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
355 sample_seq += all_data[contig][sample][pos]
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
356 else:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
357 if contig not in discarded:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
358 discarded[contig] = []
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
359 discarded[contig].append(pos)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
360 elif args.reference:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
361 sample_seq += args.reference[contig][pos - 1]
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
362 elif args.reference:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
363 sample_seq += args.reference[contig]
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
364
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
365 fp.write(">%s\n%s\n" % (sample, sample_seq))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
366 # Do the same for reference data.
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
367 ref_snps = ""
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
368
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
369 for contig in contigs:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
370 if contig in avail_pos:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
371 if args.reference:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
372 positions = xrange(1, len(args.reference[contig]) + 1)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
373 else:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
374 positions = avail_pos[contig].keys()
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
375 for pos in positions:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
376 if pos in avail_pos[contig]:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
377 if not args.column_Ns or float(pos_stats[contig][pos]["N"]) / len(samples) < args.column_Ns and \
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
378 float(pos_stats[contig][pos]["-"]) / len(samples) < args.column_Ns:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
379
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
380 ref_snps += str(avail_pos[contig][pos])
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
381 snp_positions.append((contig, pos,))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
382 elif args.reference:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
383 ref_snps += args.reference[contig][pos - 1]
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
384 elif args.reference:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
385 ref_snps += args.reference[contig]
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
386
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
387 fp.write(">reference\n%s\n" % ref_snps)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
388
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
389 if can_stats and args.with_stats:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
390 with open(args.with_stats, "wb") as fp:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
391 fp.write("contig\tposition\tmutations\tn_frac\n")
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
392 for values in snp_positions:
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
393 fp.write("%s\t%s\t%s\t%s\n" % (values[0],
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
394 values[1],
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
395 float(pos_stats[values[0]][values[1]]["mut"]) / len(args.input),
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
396 float(pos_stats[values[0]][values[1]]["N"]) / len(args.input)))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
397 plot_stats(pos_stats, len(samples), discarded=discarded, plots_dir=os.path.abspath(args.plots_dir))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
398 # print_stats(sample_stats, pos_stats, total_vars=len(avail_pos[contig]))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
399
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
400 total_discarded = 0
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
401 for _, i in discarded.items():
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
402 total_discarded += len(i)
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
403 logging.info("Discarded total of %i poor quality columns", float(total_discarded) / len(args.input))
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
404 return 0
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
405
f72039c5faa4 Uploaded
ulfschaefer
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: run main() and hand its return value to the
    # interpreter as the process exit status.
    import sys
    # NOTE(review): `time` is only referenced by the disabled timing code
    # below; it is kept because the rest of the file is not visible from
    # here -- confirm nothing else relies on it before removing.
    import time

    # with PyCallGraph(output=graphviz):
    # T0 = time.time()
    r = main()
    # T1 = time.time()

    # print "Time taken: %i" % (T1 - T0)

    # Use sys.exit() rather than the bare exit() builtin: exit() is injected
    # by the `site` module for interactive sessions and is not available
    # under `python -S` or in frozen/embedded interpreters.
    sys.exit(r)