0
|
1 import sys
|
|
2 import pysam
|
|
3 import logging
|
|
4 import struct
|
|
5 from random import sample
|
|
6 from collections import defaultdict
|
|
7 import os
|
|
8 import gzip
|
|
9
|
|
10 def checkIfGzip(filename):
|
|
11 try:
|
|
12 gf = gzip.open(filename)
|
|
13 gl = gf.readline()
|
|
14 gf.close()
|
|
15 vcf_check = b'##fileformat=VCF'
|
|
16 if gl[0:3] == b'BCF':
|
|
17 return 'bcf'
|
|
18 elif gl[:len(vcf_check)] == vcf_check:
|
|
19 return checkHeader(filename)
|
|
20 else:
|
|
21 return 'other'
|
|
22 except:
|
|
23 return 'nozip'
|
|
24
|
|
25 def checkHeader(filename):
|
|
26 f = open(filename,'rb')
|
|
27 l = f.readline()
|
|
28 f.close()
|
|
29 BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00'
|
|
30 #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff'
|
|
31 GZF_HEADER=b'\x1f\x8b'
|
|
32 if l[:len(BGZF_HEADER)] == BGZF_HEADER:
|
|
33 return 'bgzip'
|
|
34 if l[:len(GZF_HEADER)] == GZF_HEADER:
|
|
35 return 'gzip'
|
|
36 return 'nozip'
|
|
37
|
|
38 def checkFormat(vcfname):
|
|
39 """Checks header of given file for compression type
|
|
40
|
|
41
|
|
42 Given a filename, opens file and reads first line to check if
|
|
43 file has BGZF or GZIP header. May be extended to check for BCF format
|
|
44
|
|
45 Parameters
|
|
46 ----------
|
|
47 filename : str
|
|
48 Name of file to be checked
|
|
49
|
|
50 Returns
|
|
51 -------
|
|
52 extension : str {'bgzip','gzip','vcf','other'}
|
|
53 File extension as indicated by header
|
|
54
|
|
55 """
|
|
56 typ = checkIfGzip(vcfname)
|
|
57 if typ != 'nozip':
|
|
58 return typ
|
|
59 f = open(vcfname)
|
|
60 l = f.readline()
|
|
61 f.close()
|
|
62 VCF_TAG='##fileformat=VCF'
|
|
63 if l[:len(VCF_TAG)] == VCF_TAG:
|
|
64 return 'vcf'
|
|
65 return 'other'
|
|
66
|
|
67 def checkIfCpG(record,fasta_ref,offset=0,add_chr=False):
|
|
68 dr = None
|
|
69 pos = record.pos
|
|
70 c = record.chrom
|
|
71 if record.alts is None:
|
|
72 return False
|
|
73 if add_chr:
|
|
74 c = 'chr'+record.chrom
|
|
75 if record.ref == 'C' and 'T' in record.alts:
|
|
76 seq = fasta_ref.fetch(c,pos-1,pos+1)
|
|
77 if seq[0].upper() != 'C':
|
|
78 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0]))
|
|
79 #raise Exception("checkIfCpG function not lining up properly")
|
|
80 if seq[1].upper() == 'G':
|
|
81 return True
|
|
82 return False
|
|
83 elif record.ref == 'G' and 'A' in record.alts:
|
|
84 seq = fasta_ref.fetch(c,pos-2,pos)
|
|
85 if seq[1].upper() != 'G':
|
|
86 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1]))
|
|
87 #raise Exception("checkIfCpg function not lining up on negative strand")
|
|
88 if seq[0].upper() == 'C':
|
|
89 return True
|
|
90 return False
|
|
91 return False
|
|
92
|
|
93 def checkForDuplicates(rec_list,pass_list):
|
|
94 for i in range(len(rec_list)-1):
|
|
95 if rec_list[i].pos == rec_list[i+1].pos:
|
|
96 pass_list[i] = False
|
|
97 pass_list[i+1] = False
|
|
98
|
|
99 def checkForMultiallele(rec_list,pass_list):
|
|
100 for i in range(len(rec_list)):
|
|
101 if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos:
|
|
102 pass_list[i] = False
|
|
103 pass_list[i+1] = False
|
|
104 if len(rec_list[i].alleles) > 2:
|
|
105 pass_list[i] = False
|
|
106
|
|
107 def flipChrom(chrom):
|
|
108 if chrom[0:3] == 'chr':
|
|
109 return chrom[0:3]
|
|
110 return 'chr'+chrom
|
|
111
|
|
112 def getAlleleCountDict(rec):
|
|
113 alleles = defaultdict(int)
|
|
114 total_sites = 0
|
|
115 missing_inds = 0
|
|
116 for j in range(len(rec.samples)):
|
|
117 samp = rec.samples[j]
|
|
118 if None in samp.alleles:
|
|
119 missing_inds += 1
|
|
120 for k in range(len(samp.alleles)):
|
|
121 b = samp.alleles[k]
|
|
122 if b is not None:
|
|
123 alleles[b] += 1
|
|
124 total_sites+=1
|
|
125 return alleles, total_sites, missing_inds
|
|
126
|
|
127 def isInformative(rec, mincount=2, alleles=None):
|
|
128 count = 0
|
|
129 if alleles is None:
|
|
130 alleles, total_sites, missing_inds = getAlleleCountDict(rec)
|
|
131 if len(alleles) != 2:
|
|
132 return False
|
|
133 i1,i2 = alleles.keys()
|
|
134 return (alleles[i1] >= mincount and alleles[i2] >= mincount)
|
|
135
|
|
136 def getPassSites(record_list, remove_cpg=False, remove_indels=True,
|
|
137 remove_multiallele=True, remove_missing=0,
|
|
138 inform_level=2, fasta_ref=None):
|
|
139 pass_list = [True for r in record_list]
|
|
140 if remove_cpg == True and fasta_ref is None:
|
|
141 raise Exception("CpG removal requires a reference")
|
|
142 if inform_level > 2 or inform_level < 0:
|
|
143 raise Exception("Inform level %d must be between 0 and 2" % inform_level)
|
|
144 if remove_multiallele:
|
|
145 checkForMultiallele(record_list,pass_list)
|
|
146 for i in range(len(record_list)):
|
|
147 rec = record_list[i]
|
|
148 logging.info(rec.pos)
|
|
149 if remove_indels and not checkRecordIsSnp(rec):
|
|
150 pass_list[i] = False
|
|
151 if remove_cpg and checkIfCpG(rec,fasta_ref):
|
|
152 pass_list[i] = False
|
|
153 alleles,total_sites,missing_inds = getAlleleCountDict(rec)
|
|
154 if remove_missing != -1 and missing_inds > remove_missing:
|
|
155 pass_list[i] = False
|
|
156 if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles):
|
|
157 pass_list[i] = False
|
|
158 #pp = zip([r.pos for r in record_list],pass_list)
|
|
159 #for ppp in pp:
|
|
160 # logging.info(ppp)
|
|
161 return pass_list
|
|
162
|
|
163
|
|
164 def filterSites(record_list, remove_cpg=False, remove_indels=True,
|
|
165 remove_multiallele=True, remove_missing=0, inform_level=2,
|
|
166 fasta_ref=None):
|
|
167 pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref)
|
|
168 out_list = []
|
|
169 for i in range(len(pass_list)):
|
|
170 if pass_list[i]:
|
|
171 out_list.append(record_list[i])
|
|
172 return out_list
|
|
173
|
|
174
|
|
175
|
|
176 class VcfReader():
|
|
177 def __init__(self, vcfname, compress_flag=False, subsamp_num=None,
|
|
178 subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False):
|
|
179
|
|
180 ext = checkFormat(vcfname)
|
|
181 if ext in ['gzip','other'] :
|
|
182 raise Exception(('Input file %s is gzip-formatted, must be either '
|
|
183 'uncompressed or zipped with bgzip' % vcfname))
|
|
184 self.file_uncompressed = (ext == 'vcf')
|
|
185 self.reader_uncompressed = (self.file_uncompressed and not compress_flag)
|
|
186 self.popmodel = None
|
|
187 self.popkeys = None
|
|
188 if popmodel is not None and use_allpop:
|
|
189 raise Exception("Popmodel and allpop cannot both be specified")
|
|
190 if compress_flag and file_uncompressed:
|
|
191 vcfname = compressVcf(vcfname)
|
|
192 if subsamp_num is not None:
|
|
193 if subsamp_list is not None:
|
|
194 raise Exception('Multiple subsampling options called in getVcfReader')
|
|
195 subsamp_list = getSubsampleList(vcfname, subsamp_num)
|
|
196 elif subsamp_fn is not None:
|
|
197 if subsamp_list is not None:
|
|
198 raise Exception('Multiple subsampling options called in getVcfReader')
|
|
199 subsamp_file = open(subsamp_fn,'r')
|
|
200 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
|
|
201 subsamp_file.close()
|
|
202
|
|
203 if index is None:
|
|
204 self.reader = pysam.VariantFile(vcfname)
|
|
205 else:
|
|
206 self.reader = pysam.VariantFile(vcfname, index_filename=index)
|
|
207 if popmodel is not None:
|
|
208 self.popmodel = popmodel
|
|
209 popsamp_list = popmodel.inds
|
|
210 self.reader.subset_samples(popsamp_list)
|
|
211 self.setPopIdx()
|
|
212 if use_allpop:
|
|
213 self.setAllPop()
|
|
214 if subsamp_list is not None:
|
|
215 logging.debug('Subsampling %d individuals from VCF file' %
|
|
216 (len(subsamp_list)))
|
|
217 self.reader.subset_samples(subsamp_list)
|
|
218 self.prev_last_rec = next(self.reader)
|
|
219 self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr')
|
|
220
|
|
221 def fetch(self, chrom=None, start=None, end=None):
|
|
222 return self.reader.fetch(chrom, start, end)
|
|
223
|
|
224 def getRecordList(self, region=None, chrom=None, start=None,
|
|
225 end=None):
|
|
226 if self.reader_uncompressed:
|
|
227 ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom)
|
|
228 return ret
|
|
229 else:
|
|
230 return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom)
|
|
231
|
|
232 def setPopIdx(self):
|
|
233 self.popkeys = {}
|
|
234 sample_names = [l for l in self.reader.header.samples]
|
|
235 for p in self.popmodel.pop_list:
|
|
236 self.popkeys[p] = []
|
|
237 for ind in self.popmodel.ind_dict[p]:
|
|
238 self.popkeys[p].append(sample_names.index(ind))
|
|
239
|
|
240 def close(self):
|
|
241 self.reader.close()
|
|
242
|
|
243 def setAllPop(self):
|
|
244 self.popkeys = {'ALL':[]}
|
|
245 for i in range(len(self.reader.header.samples)):
|
|
246 self.popkeys['ALL'].append(i)
|
|
247
|
|
248 def modChrom(c,vcf_chr):
|
|
249 if c is None:
|
|
250 return None
|
|
251 if vcf_chr and c[:3] != 'chr':
|
|
252 return 'chr'+c
|
|
253 if not vcf_chr and c[:3] == 'chr':
|
|
254 return c[3:]
|
|
255 return c
|
|
256
|
|
257 def getRecordList(vcf_reader, region=None, chrom=None, start=None,
|
|
258 end=None, add_chr=False):
|
|
259 """Returns list for use in subsampling from input file"""
|
|
260 if region is not None:
|
|
261 c = modChrom(region.chrom,add_chr)
|
|
262 var_sites = vcf_reader.fetch(c, region.start, region.end)
|
|
263 else:
|
|
264 c = modChrom(chrom,add_chr)
|
|
265 var_sites = vcf_reader.fetch(c, start, end)
|
|
266 lst = []
|
|
267 for rec in var_sites:
|
|
268 lst.append(rec)
|
|
269 return lst
|
|
270
|
|
271
|
|
272 def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None,
|
|
273 start=None, end=None, add_chr=False):
|
|
274 """Method for getting record list from unzipped VCF file.
|
|
275
|
|
276 This method will sequentially look through a VCF file until it finds
|
|
277 the given `start` position on `chrom`, then add all records to a list
|
|
278 until the `end` position has been reached. Note that `prev_last_rec`
|
|
279 must be kept track of externally to ensure that if consecutive regions
|
|
280 are called, the record of the first variant outside the first region
|
|
281 is not lost.
|
|
282
|
|
283 Parameters
|
|
284 ----------
|
|
285 vcf_reader : pysam VariantFile object
|
|
286 VCF reader initialized from other function
|
|
287 region : region object
|
|
288 Region object with start and end coordinates of region of interest.
|
|
289 prev_last_rec : VariantRecord object
|
|
290 Variable with last record read from VcfReader. Stored here so that
|
|
291 if two adjacent regions are called, the overflow record from the
|
|
292 first region is still included in the next region
|
|
293
|
|
294 Returns
|
|
295 -------
|
|
296 lst : list
|
|
297 List of records in given gene region
|
|
298 prev_last_rec : VariantRecord object
|
|
299 First record after target region, for use in next call
|
|
300
|
|
301 """
|
|
302 lst = []
|
|
303 if region is None:
|
|
304 lst.append(prev_last_rec)
|
|
305 for rec in vcf_reader:
|
|
306 lst.append(rec)
|
|
307 return lst, lst[-1]
|
|
308
|
|
309 if (prev_last_rec is not None and
|
|
310 region.containsRecord(prev_last_rec) == 'in'):
|
|
311 lst.append(prev_last_rec)
|
|
312 elif (prev_last_rec is not None and
|
|
313 region.containsRecord(prev_last_rec) == 'after'):
|
|
314 return []
|
|
315 rec = next(vcf_reader,None)
|
|
316 if rec is None:
|
|
317 return lst,None
|
|
318 place = region.containsRecord(rec)
|
|
319 while rec is not None and place != 'after':
|
|
320 if place == 'in':
|
|
321 lst.append(rec)
|
|
322 rec = next(vcf_reader,None)
|
|
323 if rec is None:
|
|
324 break
|
|
325 place = region.containsRecord(rec)
|
|
326 prev_last_rec = rec
|
|
327 return lst, prev_last_rec
|
|
328
|
|
329
|
|
330 def checkRecordIsSnp(rec):
|
|
331 """Checks if this record is a single nucleotide variant, returns bool."""
|
|
332 if len(rec.ref) != 1:
|
|
333 return False
|
|
334 if rec.alts is None:
|
|
335 return False
|
|
336 for allele in rec.alts:
|
|
337 if len(allele) != 1:
|
|
338 return False
|
|
339 return True
|
|
340
|
|
341
|
|
342 def getSubsampleList(vcfname, ss_count):
|
|
343 """Returns a list of the first `ss_count` individuals in `vcfname`
|
|
344
|
|
345 """
|
|
346
|
|
347 vcf_o = pysam.VariantFile(vcfname)
|
|
348 rec = next(vcf_o)
|
|
349 vcf_o.close()
|
|
350 lst = []
|
|
351 for samp in rec.samples:
|
|
352 lst.append(samp)
|
|
353 return lst[:int(ss_count)]
|
|
354
|
|
355
|
|
356 def compressVcf(vcfname,forceflag=False,remove=False):
|
|
357 """Runs bgzip and tabix on input VCF file.
|
|
358
|
|
359 Using the pysam library, this function runs the bgzip and tabix utilities
|
|
360 on the given input file. By default, this will not overwrite an existing
|
|
361 zipped file, but will overwrite an existing index. `remove` can be set to
|
|
362 delete the unzipped file.
|
|
363
|
|
364 Parameters
|
|
365 ----------
|
|
366 vcfname : str
|
|
367 Name of uncompressed VCF file
|
|
368 forceflag : bool (False)
|
|
369 If true, will overwrite (vcfname).gz if it exists
|
|
370 remove : bool (False)
|
|
371 If true, will delete uncompressed source file
|
|
372
|
|
373 Returns
|
|
374 -------
|
|
375 cvcfname : str
|
|
376 Filepath to compressed VCF file
|
|
377 """
|
|
378 cvcfname = vcfname+".gz"
|
|
379 pysam.tabix_compress(vcfname,cvcfname,force=forceflag)
|
|
380 pysam.tabix_index(cvcfname,preset="vcf",force=True)
|
|
381 if remove:
|
|
382 os.remove(vcfname)
|
|
383 return cvcfname
|
|
384
|
|
385 def vcfRegionName(prefix, region, ext, oneidx=False,
|
|
386 halfopen=True, sep='-'):
|
|
387 chrom = region.toStr(halfopen, oneidx, sep)
|
|
388 return prefix+'_'+chrom+'.'+ext
|
|
389
|
|
390 def getRecordsInRegion(region, record_list):
|
|
391 sub_list = []
|
|
392 for i in range(len(record_list)):
|
|
393 loc = region.containsRecord(record_list[i])
|
|
394 if loc == "in":
|
|
395 sub_list.append(record_list[i])
|
|
396 elif loc == "after":
|
|
397 break
|
|
398 return sub_list
|
|
399
|
|
400
|
|
401
|
|
402
|
|
403
|
|
404 #def getVcfReader(args):
|
|
405 def getVcfReader(vcfname, compress_flag=False, subsamp_num=None,
|
|
406 subsamp_fn=None, subsamp_list=None, index=None):
|
|
407 """Returns a reader for a given input VCF file.
|
|
408
|
|
409 Given a filename, filetype, compression option, and optional Subsampling
|
|
410 options, will return a pysam.VariantFile object for iteration and
|
|
411 a flag as to whether this file is compressed or uncompressed.
|
|
412
|
|
413 Parameters
|
|
414 ----------
|
|
415 vcfname : str
|
|
416 Filename for VCF file. The extension of this file will be used to
|
|
417 determine whether it is compressed or not unless `var_ext` is set.
|
|
418 var_ext : str (None)
|
|
419 Extension for VCF file if it is not included in the filename.
|
|
420 compress_flag : bool (False)
|
|
421 If filetype is uncompressed and this is set to true, will run
|
|
422 compressVcf function.
|
|
423 subsamp_num : int (None)
|
|
424 If set, will randomly select `subsamp_num` individuals (not
|
|
425 genotypes) from the input VCF file and return a reader with
|
|
426 only those data.
|
|
427 subsamp_fn : str (None)
|
|
428 If set, will return a reader with only data from the samples listed
|
|
429 in the file provided. Cannot be used with other subsampling options.
|
|
430 subsamp_list : list (None)
|
|
431 If set, will return reader with records containing only
|
|
432 individuals named in the list. Cannot be used with other subsampling
|
|
433 options.
|
|
434
|
|
435 Returns
|
|
436 -------
|
|
437 vcf_reader : pysam.VariantFile
|
|
438 A reader that can be iterated through for variant records. If
|
|
439 compressed, it will be able to use the pysam fetch method, otherwise
|
|
440 it must be read through sequentially
|
|
441 reader_uncompressed : bool
|
|
442 If True, VCF reader is uncompressed. This means the fetch method
|
|
443 cannot be used and region access must be done using the
|
|
444 "getRecordListUnzipped" method.
|
|
445
|
|
446 """
|
|
447 ext = checkFormat(vcfname)
|
|
448 if ext in ['gzip','other'] :
|
|
449 raise Exception(('Input file %s is gzip-formatted, must be either '
|
|
450 'uncompressed or zipped with bgzip' % vcfname))
|
|
451 file_uncompressed = (ext == 'vcf')
|
|
452 reader_uncompressed = (file_uncompressed and not compress_flag)
|
|
453 if compress_flag and file_uncompressed:
|
|
454 vcfname = compressVcf(vcfname)
|
|
455 #subsamp_list = None
|
|
456 if subsamp_num is not None:
|
|
457 if subsamp_list is not None:
|
|
458 raise Exception('Multiple subsampling options called in getVcfReader')
|
|
459 subsamp_list = getSubsampleList(vcfname, subsamp_num)
|
|
460 elif subsamp_fn is not None:
|
|
461 if subsamp_list is not None:
|
|
462 raise Exception('Multiple subsampling options called in getVcfReader')
|
|
463 subsamp_file = open(subsamp_fn,'r')
|
|
464 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
|
|
465 subsamp_file.close()
|
|
466 if index is None:
|
|
467 vcf_reader = pysam.VariantFile(vcfname)
|
|
468 else:
|
|
469 vcf_reader = pysam.VariantFile(vcfname, index_filename=index)
|
|
470 if subsamp_list is not None:
|
|
471 logging.debug('Subsampling %d individuals from VCF file' %
|
|
472 (len(subsamp_list)))
|
|
473 vcf_reader.subset_samples(subsamp_list)
|
|
474 return vcf_reader, reader_uncompressed
|