comparison vcf_reader_func.py @ 0:3830d29fca6a draft

Uploaded
author jaredgk
date Mon, 15 Oct 2018 18:15:47 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3830d29fca6a
1 import sys
2 import pysam
3 import logging
4 import struct
5 from random import sample
6 from collections import defaultdict
7 import os
8 import gzip
9
10 def checkIfGzip(filename):
11 try:
12 gf = gzip.open(filename)
13 gl = gf.readline()
14 gf.close()
15 vcf_check = b'##fileformat=VCF'
16 if gl[0:3] == b'BCF':
17 return 'bcf'
18 elif gl[:len(vcf_check)] == vcf_check:
19 return checkHeader(filename)
20 else:
21 return 'other'
22 except:
23 return 'nozip'
24
25 def checkHeader(filename):
26 f = open(filename,'rb')
27 l = f.readline()
28 f.close()
29 BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00'
30 #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff'
31 GZF_HEADER=b'\x1f\x8b'
32 if l[:len(BGZF_HEADER)] == BGZF_HEADER:
33 return 'bgzip'
34 if l[:len(GZF_HEADER)] == GZF_HEADER:
35 return 'gzip'
36 return 'nozip'
37
38 def checkFormat(vcfname):
39 """Checks header of given file for compression type
40
41
42 Given a filename, opens file and reads first line to check if
43 file has BGZF or GZIP header. May be extended to check for BCF format
44
45 Parameters
46 ----------
47 filename : str
48 Name of file to be checked
49
50 Returns
51 -------
52 extension : str {'bgzip','gzip','vcf','other'}
53 File extension as indicated by header
54
55 """
56 typ = checkIfGzip(vcfname)
57 if typ != 'nozip':
58 return typ
59 f = open(vcfname)
60 l = f.readline()
61 f.close()
62 VCF_TAG='##fileformat=VCF'
63 if l[:len(VCF_TAG)] == VCF_TAG:
64 return 'vcf'
65 return 'other'
66
67 def checkIfCpG(record,fasta_ref,offset=0,add_chr=False):
68 dr = None
69 pos = record.pos
70 c = record.chrom
71 if record.alts is None:
72 return False
73 if add_chr:
74 c = 'chr'+record.chrom
75 if record.ref == 'C' and 'T' in record.alts:
76 seq = fasta_ref.fetch(c,pos-1,pos+1)
77 if seq[0].upper() != 'C':
78 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0]))
79 #raise Exception("checkIfCpG function not lining up properly")
80 if seq[1].upper() == 'G':
81 return True
82 return False
83 elif record.ref == 'G' and 'A' in record.alts:
84 seq = fasta_ref.fetch(c,pos-2,pos)
85 if seq[1].upper() != 'G':
86 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1]))
87 #raise Exception("checkIfCpg function not lining up on negative strand")
88 if seq[0].upper() == 'C':
89 return True
90 return False
91 return False
92
93 def checkForDuplicates(rec_list,pass_list):
94 for i in range(len(rec_list)-1):
95 if rec_list[i].pos == rec_list[i+1].pos:
96 pass_list[i] = False
97 pass_list[i+1] = False
98
99 def checkForMultiallele(rec_list,pass_list):
100 for i in range(len(rec_list)):
101 if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos:
102 pass_list[i] = False
103 pass_list[i+1] = False
104 if len(rec_list[i].alleles) > 2:
105 pass_list[i] = False
106
107 def flipChrom(chrom):
108 if chrom[0:3] == 'chr':
109 return chrom[0:3]
110 return 'chr'+chrom
111
112 def getAlleleCountDict(rec):
113 alleles = defaultdict(int)
114 total_sites = 0
115 missing_inds = 0
116 for j in range(len(rec.samples)):
117 samp = rec.samples[j]
118 if None in samp.alleles:
119 missing_inds += 1
120 for k in range(len(samp.alleles)):
121 b = samp.alleles[k]
122 if b is not None:
123 alleles[b] += 1
124 total_sites+=1
125 return alleles, total_sites, missing_inds
126
127 def isInformative(rec, mincount=2, alleles=None):
128 count = 0
129 if alleles is None:
130 alleles, total_sites, missing_inds = getAlleleCountDict(rec)
131 if len(alleles) != 2:
132 return False
133 i1,i2 = alleles.keys()
134 return (alleles[i1] >= mincount and alleles[i2] >= mincount)
135
136 def getPassSites(record_list, remove_cpg=False, remove_indels=True,
137 remove_multiallele=True, remove_missing=0,
138 inform_level=2, fasta_ref=None):
139 pass_list = [True for r in record_list]
140 if remove_cpg == True and fasta_ref is None:
141 raise Exception("CpG removal requires a reference")
142 if inform_level > 2 or inform_level < 0:
143 raise Exception("Inform level %d must be between 0 and 2" % inform_level)
144 if remove_multiallele:
145 checkForMultiallele(record_list,pass_list)
146 for i in range(len(record_list)):
147 rec = record_list[i]
148 logging.info(rec.pos)
149 if remove_indels and not checkRecordIsSnp(rec):
150 pass_list[i] = False
151 if remove_cpg and checkIfCpG(rec,fasta_ref):
152 pass_list[i] = False
153 alleles,total_sites,missing_inds = getAlleleCountDict(rec)
154 if remove_missing != -1 and missing_inds > remove_missing:
155 pass_list[i] = False
156 if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles):
157 pass_list[i] = False
158 #pp = zip([r.pos for r in record_list],pass_list)
159 #for ppp in pp:
160 # logging.info(ppp)
161 return pass_list
162
163
164 def filterSites(record_list, remove_cpg=False, remove_indels=True,
165 remove_multiallele=True, remove_missing=0, inform_level=2,
166 fasta_ref=None):
167 pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref)
168 out_list = []
169 for i in range(len(pass_list)):
170 if pass_list[i]:
171 out_list.append(record_list[i])
172 return out_list
173
174
175
176 class VcfReader():
177 def __init__(self, vcfname, compress_flag=False, subsamp_num=None,
178 subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False):
179
180 ext = checkFormat(vcfname)
181 if ext in ['gzip','other'] :
182 raise Exception(('Input file %s is gzip-formatted, must be either '
183 'uncompressed or zipped with bgzip' % vcfname))
184 self.file_uncompressed = (ext == 'vcf')
185 self.reader_uncompressed = (self.file_uncompressed and not compress_flag)
186 self.popmodel = None
187 self.popkeys = None
188 if popmodel is not None and use_allpop:
189 raise Exception("Popmodel and allpop cannot both be specified")
190 if compress_flag and file_uncompressed:
191 vcfname = compressVcf(vcfname)
192 if subsamp_num is not None:
193 if subsamp_list is not None:
194 raise Exception('Multiple subsampling options called in getVcfReader')
195 subsamp_list = getSubsampleList(vcfname, subsamp_num)
196 elif subsamp_fn is not None:
197 if subsamp_list is not None:
198 raise Exception('Multiple subsampling options called in getVcfReader')
199 subsamp_file = open(subsamp_fn,'r')
200 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
201 subsamp_file.close()
202
203 if index is None:
204 self.reader = pysam.VariantFile(vcfname)
205 else:
206 self.reader = pysam.VariantFile(vcfname, index_filename=index)
207 if popmodel is not None:
208 self.popmodel = popmodel
209 popsamp_list = popmodel.inds
210 self.reader.subset_samples(popsamp_list)
211 self.setPopIdx()
212 if use_allpop:
213 self.setAllPop()
214 if subsamp_list is not None:
215 logging.debug('Subsampling %d individuals from VCF file' %
216 (len(subsamp_list)))
217 self.reader.subset_samples(subsamp_list)
218 self.prev_last_rec = next(self.reader)
219 self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr')
220
221 def fetch(self, chrom=None, start=None, end=None):
222 return self.reader.fetch(chrom, start, end)
223
224 def getRecordList(self, region=None, chrom=None, start=None,
225 end=None):
226 if self.reader_uncompressed:
227 ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom)
228 return ret
229 else:
230 return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom)
231
232 def setPopIdx(self):
233 self.popkeys = {}
234 sample_names = [l for l in self.reader.header.samples]
235 for p in self.popmodel.pop_list:
236 self.popkeys[p] = []
237 for ind in self.popmodel.ind_dict[p]:
238 self.popkeys[p].append(sample_names.index(ind))
239
240 def close(self):
241 self.reader.close()
242
243 def setAllPop(self):
244 self.popkeys = {'ALL':[]}
245 for i in range(len(self.reader.header.samples)):
246 self.popkeys['ALL'].append(i)
247
248 def modChrom(c,vcf_chr):
249 if c is None:
250 return None
251 if vcf_chr and c[:3] != 'chr':
252 return 'chr'+c
253 if not vcf_chr and c[:3] == 'chr':
254 return c[3:]
255 return c
256
257 def getRecordList(vcf_reader, region=None, chrom=None, start=None,
258 end=None, add_chr=False):
259 """Returns list for use in subsampling from input file"""
260 if region is not None:
261 c = modChrom(region.chrom,add_chr)
262 var_sites = vcf_reader.fetch(c, region.start, region.end)
263 else:
264 c = modChrom(chrom,add_chr)
265 var_sites = vcf_reader.fetch(c, start, end)
266 lst = []
267 for rec in var_sites:
268 lst.append(rec)
269 return lst
270
271
272 def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None,
273 start=None, end=None, add_chr=False):
274 """Method for getting record list from unzipped VCF file.
275
276 This method will sequentially look through a VCF file until it finds
277 the given `start` position on `chrom`, then add all records to a list
278 until the `end` position has been reached. Note that `prev_last_rec`
279 must be kept track of externally to ensure that if consecutive regions
280 are called, the record of the first variant outside the first region
281 is not lost.
282
283 Parameters
284 ----------
285 vcf_reader : pysam VariantFile object
286 VCF reader initialized from other function
287 region : region object
288 Region object with start and end coordinates of region of interest.
289 prev_last_rec : VariantRecord object
290 Variable with last record read from VcfReader. Stored here so that
291 if two adjacent regions are called, the overflow record from the
292 first region is still included in the next region
293
294 Returns
295 -------
296 lst : list
297 List of records in given gene region
298 prev_last_rec : VariantRecord object
299 First record after target region, for use in next call
300
301 """
302 lst = []
303 if region is None:
304 lst.append(prev_last_rec)
305 for rec in vcf_reader:
306 lst.append(rec)
307 return lst, lst[-1]
308
309 if (prev_last_rec is not None and
310 region.containsRecord(prev_last_rec) == 'in'):
311 lst.append(prev_last_rec)
312 elif (prev_last_rec is not None and
313 region.containsRecord(prev_last_rec) == 'after'):
314 return []
315 rec = next(vcf_reader,None)
316 if rec is None:
317 return lst,None
318 place = region.containsRecord(rec)
319 while rec is not None and place != 'after':
320 if place == 'in':
321 lst.append(rec)
322 rec = next(vcf_reader,None)
323 if rec is None:
324 break
325 place = region.containsRecord(rec)
326 prev_last_rec = rec
327 return lst, prev_last_rec
328
329
330 def checkRecordIsSnp(rec):
331 """Checks if this record is a single nucleotide variant, returns bool."""
332 if len(rec.ref) != 1:
333 return False
334 if rec.alts is None:
335 return False
336 for allele in rec.alts:
337 if len(allele) != 1:
338 return False
339 return True
340
341
342 def getSubsampleList(vcfname, ss_count):
343 """Returns a list of the first `ss_count` individuals in `vcfname`
344
345 """
346
347 vcf_o = pysam.VariantFile(vcfname)
348 rec = next(vcf_o)
349 vcf_o.close()
350 lst = []
351 for samp in rec.samples:
352 lst.append(samp)
353 return lst[:int(ss_count)]
354
355
356 def compressVcf(vcfname,forceflag=False,remove=False):
357 """Runs bgzip and tabix on input VCF file.
358
359 Using the pysam library, this function runs the bgzip and tabix utilities
360 on the given input file. By default, this will not overwrite an existing
361 zipped file, but will overwrite an existing index. `remove` can be set to
362 delete the unzipped file.
363
364 Parameters
365 ----------
366 vcfname : str
367 Name of uncompressed VCF file
368 forceflag : bool (False)
369 If true, will overwrite (vcfname).gz if it exists
370 remove : bool (False)
371 If true, will delete uncompressed source file
372
373 Returns
374 -------
375 cvcfname : str
376 Filepath to compressed VCF file
377 """
378 cvcfname = vcfname+".gz"
379 pysam.tabix_compress(vcfname,cvcfname,force=forceflag)
380 pysam.tabix_index(cvcfname,preset="vcf",force=True)
381 if remove:
382 os.remove(vcfname)
383 return cvcfname
384
385 def vcfRegionName(prefix, region, ext, oneidx=False,
386 halfopen=True, sep='-'):
387 chrom = region.toStr(halfopen, oneidx, sep)
388 return prefix+'_'+chrom+'.'+ext
389
390 def getRecordsInRegion(region, record_list):
391 sub_list = []
392 for i in range(len(record_list)):
393 loc = region.containsRecord(record_list[i])
394 if loc == "in":
395 sub_list.append(record_list[i])
396 elif loc == "after":
397 break
398 return sub_list
399
400
401
402
403
404 #def getVcfReader(args):
405 def getVcfReader(vcfname, compress_flag=False, subsamp_num=None,
406 subsamp_fn=None, subsamp_list=None, index=None):
407 """Returns a reader for a given input VCF file.
408
409 Given a filename, filetype, compression option, and optional Subsampling
410 options, will return a pysam.VariantFile object for iteration and
411 a flag as to whether this file is compressed or uncompressed.
412
413 Parameters
414 ----------
415 vcfname : str
416 Filename for VCF file. The extension of this file will be used to
417 determine whether it is compressed or not unless `var_ext` is set.
418 var_ext : str (None)
419 Extension for VCF file if it is not included in the filename.
420 compress_flag : bool (False)
421 If filetype is uncompressed and this is set to true, will run
422 compressVcf function.
423 subsamp_num : int (None)
424 If set, will randomly select `subsamp_num` individuals (not
425 genotypes) from the input VCF file and return a reader with
426 only those data.
427 subsamp_fn : str (None)
428 If set, will return a reader with only data from the samples listed
429 in the file provided. Cannot be used with other subsampling options.
430 subsamp_list : list (None)
431 If set, will return reader with records containing only
432 individuals named in the list. Cannot be used with other subsampling
433 options.
434
435 Returns
436 -------
437 vcf_reader : pysam.VariantFile
438 A reader that can be iterated through for variant records. If
439 compressed, it will be able to use the pysam fetch method, otherwise
440 it must be read through sequentially
441 reader_uncompressed : bool
442 If True, VCF reader is uncompressed. This means the fetch method
443 cannot be used and region access must be done using the
444 "getRecordListUnzipped" method.
445
446 """
447 ext = checkFormat(vcfname)
448 if ext in ['gzip','other'] :
449 raise Exception(('Input file %s is gzip-formatted, must be either '
450 'uncompressed or zipped with bgzip' % vcfname))
451 file_uncompressed = (ext == 'vcf')
452 reader_uncompressed = (file_uncompressed and not compress_flag)
453 if compress_flag and file_uncompressed:
454 vcfname = compressVcf(vcfname)
455 #subsamp_list = None
456 if subsamp_num is not None:
457 if subsamp_list is not None:
458 raise Exception('Multiple subsampling options called in getVcfReader')
459 subsamp_list = getSubsampleList(vcfname, subsamp_num)
460 elif subsamp_fn is not None:
461 if subsamp_list is not None:
462 raise Exception('Multiple subsampling options called in getVcfReader')
463 subsamp_file = open(subsamp_fn,'r')
464 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
465 subsamp_file.close()
466 if index is None:
467 vcf_reader = pysam.VariantFile(vcfname)
468 else:
469 vcf_reader = pysam.VariantFile(vcfname, index_filename=index)
470 if subsamp_list is not None:
471 logging.debug('Subsampling %d individuals from VCF file' %
472 (len(subsamp_list)))
473 vcf_reader.subset_samples(subsamp_list)
474 return vcf_reader, reader_uncompressed