Mercurial > repos > jaredgk > ppp_vcfphase
comparison vcf_reader_func.py @ 0:3830d29fca6a draft
Uploaded
author | jaredgk |
---|---|
date | Mon, 15 Oct 2018 18:15:47 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3830d29fca6a |
---|---|
1 import sys | |
2 import pysam | |
3 import logging | |
4 import struct | |
5 from random import sample | |
6 from collections import defaultdict | |
7 import os | |
8 import gzip | |
9 | |
10 def checkIfGzip(filename): | |
11 try: | |
12 gf = gzip.open(filename) | |
13 gl = gf.readline() | |
14 gf.close() | |
15 vcf_check = b'##fileformat=VCF' | |
16 if gl[0:3] == b'BCF': | |
17 return 'bcf' | |
18 elif gl[:len(vcf_check)] == vcf_check: | |
19 return checkHeader(filename) | |
20 else: | |
21 return 'other' | |
22 except: | |
23 return 'nozip' | |
24 | |
25 def checkHeader(filename): | |
26 f = open(filename,'rb') | |
27 l = f.readline() | |
28 f.close() | |
29 BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00' | |
30 #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff' | |
31 GZF_HEADER=b'\x1f\x8b' | |
32 if l[:len(BGZF_HEADER)] == BGZF_HEADER: | |
33 return 'bgzip' | |
34 if l[:len(GZF_HEADER)] == GZF_HEADER: | |
35 return 'gzip' | |
36 return 'nozip' | |
37 | |
38 def checkFormat(vcfname): | |
39 """Checks header of given file for compression type | |
40 | |
41 | |
42 Given a filename, opens file and reads first line to check if | |
43 file has BGZF or GZIP header. May be extended to check for BCF format | |
44 | |
45 Parameters | |
46 ---------- | |
47 filename : str | |
48 Name of file to be checked | |
49 | |
50 Returns | |
51 ------- | |
52 extension : str {'bgzip','gzip','vcf','other'} | |
53 File extension as indicated by header | |
54 | |
55 """ | |
56 typ = checkIfGzip(vcfname) | |
57 if typ != 'nozip': | |
58 return typ | |
59 f = open(vcfname) | |
60 l = f.readline() | |
61 f.close() | |
62 VCF_TAG='##fileformat=VCF' | |
63 if l[:len(VCF_TAG)] == VCF_TAG: | |
64 return 'vcf' | |
65 return 'other' | |
66 | |
67 def checkIfCpG(record,fasta_ref,offset=0,add_chr=False): | |
68 dr = None | |
69 pos = record.pos | |
70 c = record.chrom | |
71 if record.alts is None: | |
72 return False | |
73 if add_chr: | |
74 c = 'chr'+record.chrom | |
75 if record.ref == 'C' and 'T' in record.alts: | |
76 seq = fasta_ref.fetch(c,pos-1,pos+1) | |
77 if seq[0].upper() != 'C': | |
78 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0])) | |
79 #raise Exception("checkIfCpG function not lining up properly") | |
80 if seq[1].upper() == 'G': | |
81 return True | |
82 return False | |
83 elif record.ref == 'G' and 'A' in record.alts: | |
84 seq = fasta_ref.fetch(c,pos-2,pos) | |
85 if seq[1].upper() != 'G': | |
86 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1])) | |
87 #raise Exception("checkIfCpg function not lining up on negative strand") | |
88 if seq[0].upper() == 'C': | |
89 return True | |
90 return False | |
91 return False | |
92 | |
93 def checkForDuplicates(rec_list,pass_list): | |
94 for i in range(len(rec_list)-1): | |
95 if rec_list[i].pos == rec_list[i+1].pos: | |
96 pass_list[i] = False | |
97 pass_list[i+1] = False | |
98 | |
99 def checkForMultiallele(rec_list,pass_list): | |
100 for i in range(len(rec_list)): | |
101 if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos: | |
102 pass_list[i] = False | |
103 pass_list[i+1] = False | |
104 if len(rec_list[i].alleles) > 2: | |
105 pass_list[i] = False | |
106 | |
107 def flipChrom(chrom): | |
108 if chrom[0:3] == 'chr': | |
109 return chrom[0:3] | |
110 return 'chr'+chrom | |
111 | |
112 def getAlleleCountDict(rec): | |
113 alleles = defaultdict(int) | |
114 total_sites = 0 | |
115 missing_inds = 0 | |
116 for j in range(len(rec.samples)): | |
117 samp = rec.samples[j] | |
118 if None in samp.alleles: | |
119 missing_inds += 1 | |
120 for k in range(len(samp.alleles)): | |
121 b = samp.alleles[k] | |
122 if b is not None: | |
123 alleles[b] += 1 | |
124 total_sites+=1 | |
125 return alleles, total_sites, missing_inds | |
126 | |
127 def isInformative(rec, mincount=2, alleles=None): | |
128 count = 0 | |
129 if alleles is None: | |
130 alleles, total_sites, missing_inds = getAlleleCountDict(rec) | |
131 if len(alleles) != 2: | |
132 return False | |
133 i1,i2 = alleles.keys() | |
134 return (alleles[i1] >= mincount and alleles[i2] >= mincount) | |
135 | |
136 def getPassSites(record_list, remove_cpg=False, remove_indels=True, | |
137 remove_multiallele=True, remove_missing=0, | |
138 inform_level=2, fasta_ref=None): | |
139 pass_list = [True for r in record_list] | |
140 if remove_cpg == True and fasta_ref is None: | |
141 raise Exception("CpG removal requires a reference") | |
142 if inform_level > 2 or inform_level < 0: | |
143 raise Exception("Inform level %d must be between 0 and 2" % inform_level) | |
144 if remove_multiallele: | |
145 checkForMultiallele(record_list,pass_list) | |
146 for i in range(len(record_list)): | |
147 rec = record_list[i] | |
148 logging.info(rec.pos) | |
149 if remove_indels and not checkRecordIsSnp(rec): | |
150 pass_list[i] = False | |
151 if remove_cpg and checkIfCpG(rec,fasta_ref): | |
152 pass_list[i] = False | |
153 alleles,total_sites,missing_inds = getAlleleCountDict(rec) | |
154 if remove_missing != -1 and missing_inds > remove_missing: | |
155 pass_list[i] = False | |
156 if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles): | |
157 pass_list[i] = False | |
158 #pp = zip([r.pos for r in record_list],pass_list) | |
159 #for ppp in pp: | |
160 # logging.info(ppp) | |
161 return pass_list | |
162 | |
163 | |
164 def filterSites(record_list, remove_cpg=False, remove_indels=True, | |
165 remove_multiallele=True, remove_missing=0, inform_level=2, | |
166 fasta_ref=None): | |
167 pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref) | |
168 out_list = [] | |
169 for i in range(len(pass_list)): | |
170 if pass_list[i]: | |
171 out_list.append(record_list[i]) | |
172 return out_list | |
173 | |
174 | |
175 | |
176 class VcfReader(): | |
177 def __init__(self, vcfname, compress_flag=False, subsamp_num=None, | |
178 subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False): | |
179 | |
180 ext = checkFormat(vcfname) | |
181 if ext in ['gzip','other'] : | |
182 raise Exception(('Input file %s is gzip-formatted, must be either ' | |
183 'uncompressed or zipped with bgzip' % vcfname)) | |
184 self.file_uncompressed = (ext == 'vcf') | |
185 self.reader_uncompressed = (self.file_uncompressed and not compress_flag) | |
186 self.popmodel = None | |
187 self.popkeys = None | |
188 if popmodel is not None and use_allpop: | |
189 raise Exception("Popmodel and allpop cannot both be specified") | |
190 if compress_flag and file_uncompressed: | |
191 vcfname = compressVcf(vcfname) | |
192 if subsamp_num is not None: | |
193 if subsamp_list is not None: | |
194 raise Exception('Multiple subsampling options called in getVcfReader') | |
195 subsamp_list = getSubsampleList(vcfname, subsamp_num) | |
196 elif subsamp_fn is not None: | |
197 if subsamp_list is not None: | |
198 raise Exception('Multiple subsampling options called in getVcfReader') | |
199 subsamp_file = open(subsamp_fn,'r') | |
200 subsamp_list = [l.strip() for l in subsamp_file.readlines()] | |
201 subsamp_file.close() | |
202 | |
203 if index is None: | |
204 self.reader = pysam.VariantFile(vcfname) | |
205 else: | |
206 self.reader = pysam.VariantFile(vcfname, index_filename=index) | |
207 if popmodel is not None: | |
208 self.popmodel = popmodel | |
209 popsamp_list = popmodel.inds | |
210 self.reader.subset_samples(popsamp_list) | |
211 self.setPopIdx() | |
212 if use_allpop: | |
213 self.setAllPop() | |
214 if subsamp_list is not None: | |
215 logging.debug('Subsampling %d individuals from VCF file' % | |
216 (len(subsamp_list))) | |
217 self.reader.subset_samples(subsamp_list) | |
218 self.prev_last_rec = next(self.reader) | |
219 self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr') | |
220 | |
221 def fetch(self, chrom=None, start=None, end=None): | |
222 return self.reader.fetch(chrom, start, end) | |
223 | |
224 def getRecordList(self, region=None, chrom=None, start=None, | |
225 end=None): | |
226 if self.reader_uncompressed: | |
227 ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom) | |
228 return ret | |
229 else: | |
230 return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom) | |
231 | |
232 def setPopIdx(self): | |
233 self.popkeys = {} | |
234 sample_names = [l for l in self.reader.header.samples] | |
235 for p in self.popmodel.pop_list: | |
236 self.popkeys[p] = [] | |
237 for ind in self.popmodel.ind_dict[p]: | |
238 self.popkeys[p].append(sample_names.index(ind)) | |
239 | |
240 def close(self): | |
241 self.reader.close() | |
242 | |
243 def setAllPop(self): | |
244 self.popkeys = {'ALL':[]} | |
245 for i in range(len(self.reader.header.samples)): | |
246 self.popkeys['ALL'].append(i) | |
247 | |
248 def modChrom(c,vcf_chr): | |
249 if c is None: | |
250 return None | |
251 if vcf_chr and c[:3] != 'chr': | |
252 return 'chr'+c | |
253 if not vcf_chr and c[:3] == 'chr': | |
254 return c[3:] | |
255 return c | |
256 | |
257 def getRecordList(vcf_reader, region=None, chrom=None, start=None, | |
258 end=None, add_chr=False): | |
259 """Returns list for use in subsampling from input file""" | |
260 if region is not None: | |
261 c = modChrom(region.chrom,add_chr) | |
262 var_sites = vcf_reader.fetch(c, region.start, region.end) | |
263 else: | |
264 c = modChrom(chrom,add_chr) | |
265 var_sites = vcf_reader.fetch(c, start, end) | |
266 lst = [] | |
267 for rec in var_sites: | |
268 lst.append(rec) | |
269 return lst | |
270 | |
271 | |
272 def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None, | |
273 start=None, end=None, add_chr=False): | |
274 """Method for getting record list from unzipped VCF file. | |
275 | |
276 This method will sequentially look through a VCF file until it finds | |
277 the given `start` position on `chrom`, then add all records to a list | |
278 until the `end` position has been reached. Note that `prev_last_rec` | |
279 must be kept track of externally to ensure that if consecutive regions | |
280 are called, the record of the first variant outside the first region | |
281 is not lost. | |
282 | |
283 Parameters | |
284 ---------- | |
285 vcf_reader : pysam VariantFile object | |
286 VCF reader initialized from other function | |
287 region : region object | |
288 Region object with start and end coordinates of region of interest. | |
289 prev_last_rec : VariantRecord object | |
290 Variable with last record read from VcfReader. Stored here so that | |
291 if two adjacent regions are called, the overflow record from the | |
292 first region is still included in the next region | |
293 | |
294 Returns | |
295 ------- | |
296 lst : list | |
297 List of records in given gene region | |
298 prev_last_rec : VariantRecord object | |
299 First record after target region, for use in next call | |
300 | |
301 """ | |
302 lst = [] | |
303 if region is None: | |
304 lst.append(prev_last_rec) | |
305 for rec in vcf_reader: | |
306 lst.append(rec) | |
307 return lst, lst[-1] | |
308 | |
309 if (prev_last_rec is not None and | |
310 region.containsRecord(prev_last_rec) == 'in'): | |
311 lst.append(prev_last_rec) | |
312 elif (prev_last_rec is not None and | |
313 region.containsRecord(prev_last_rec) == 'after'): | |
314 return [] | |
315 rec = next(vcf_reader,None) | |
316 if rec is None: | |
317 return lst,None | |
318 place = region.containsRecord(rec) | |
319 while rec is not None and place != 'after': | |
320 if place == 'in': | |
321 lst.append(rec) | |
322 rec = next(vcf_reader,None) | |
323 if rec is None: | |
324 break | |
325 place = region.containsRecord(rec) | |
326 prev_last_rec = rec | |
327 return lst, prev_last_rec | |
328 | |
329 | |
330 def checkRecordIsSnp(rec): | |
331 """Checks if this record is a single nucleotide variant, returns bool.""" | |
332 if len(rec.ref) != 1: | |
333 return False | |
334 if rec.alts is None: | |
335 return False | |
336 for allele in rec.alts: | |
337 if len(allele) != 1: | |
338 return False | |
339 return True | |
340 | |
341 | |
342 def getSubsampleList(vcfname, ss_count): | |
343 """Returns a list of the first `ss_count` individuals in `vcfname` | |
344 | |
345 """ | |
346 | |
347 vcf_o = pysam.VariantFile(vcfname) | |
348 rec = next(vcf_o) | |
349 vcf_o.close() | |
350 lst = [] | |
351 for samp in rec.samples: | |
352 lst.append(samp) | |
353 return lst[:int(ss_count)] | |
354 | |
355 | |
356 def compressVcf(vcfname,forceflag=False,remove=False): | |
357 """Runs bgzip and tabix on input VCF file. | |
358 | |
359 Using the pysam library, this function runs the bgzip and tabix utilities | |
360 on the given input file. By default, this will not overwrite an existing | |
361 zipped file, but will overwrite an existing index. `remove` can be set to | |
362 delete the unzipped file. | |
363 | |
364 Parameters | |
365 ---------- | |
366 vcfname : str | |
367 Name of uncompressed VCF file | |
368 forceflag : bool (False) | |
369 If true, will overwrite (vcfname).gz if it exists | |
370 remove : bool (False) | |
371 If true, will delete uncompressed source file | |
372 | |
373 Returns | |
374 ------- | |
375 cvcfname : str | |
376 Filepath to compressed VCF file | |
377 """ | |
378 cvcfname = vcfname+".gz" | |
379 pysam.tabix_compress(vcfname,cvcfname,force=forceflag) | |
380 pysam.tabix_index(cvcfname,preset="vcf",force=True) | |
381 if remove: | |
382 os.remove(vcfname) | |
383 return cvcfname | |
384 | |
385 def vcfRegionName(prefix, region, ext, oneidx=False, | |
386 halfopen=True, sep='-'): | |
387 chrom = region.toStr(halfopen, oneidx, sep) | |
388 return prefix+'_'+chrom+'.'+ext | |
389 | |
390 def getRecordsInRegion(region, record_list): | |
391 sub_list = [] | |
392 for i in range(len(record_list)): | |
393 loc = region.containsRecord(record_list[i]) | |
394 if loc == "in": | |
395 sub_list.append(record_list[i]) | |
396 elif loc == "after": | |
397 break | |
398 return sub_list | |
399 | |
400 | |
401 | |
402 | |
403 | |
404 #def getVcfReader(args): | |
405 def getVcfReader(vcfname, compress_flag=False, subsamp_num=None, | |
406 subsamp_fn=None, subsamp_list=None, index=None): | |
407 """Returns a reader for a given input VCF file. | |
408 | |
409 Given a filename, filetype, compression option, and optional Subsampling | |
410 options, will return a pysam.VariantFile object for iteration and | |
411 a flag as to whether this file is compressed or uncompressed. | |
412 | |
413 Parameters | |
414 ---------- | |
415 vcfname : str | |
416 Filename for VCF file. The extension of this file will be used to | |
417 determine whether it is compressed or not unless `var_ext` is set. | |
418 var_ext : str (None) | |
419 Extension for VCF file if it is not included in the filename. | |
420 compress_flag : bool (False) | |
421 If filetype is uncompressed and this is set to true, will run | |
422 compressVcf function. | |
423 subsamp_num : int (None) | |
424 If set, will randomly select `subsamp_num` individuals (not | |
425 genotypes) from the input VCF file and return a reader with | |
426 only those data. | |
427 subsamp_fn : str (None) | |
428 If set, will return a reader with only data from the samples listed | |
429 in the file provided. Cannot be used with other subsampling options. | |
430 subsamp_list : list (None) | |
431 If set, will return reader with records containing only | |
432 individuals named in the list. Cannot be used with other subsampling | |
433 options. | |
434 | |
435 Returns | |
436 ------- | |
437 vcf_reader : pysam.VariantFile | |
438 A reader that can be iterated through for variant records. If | |
439 compressed, it will be able to use the pysam fetch method, otherwise | |
440 it must be read through sequentially | |
441 reader_uncompressed : bool | |
442 If True, VCF reader is uncompressed. This means the fetch method | |
443 cannot be used and region access must be done using the | |
444 "getRecordListUnzipped" method. | |
445 | |
446 """ | |
447 ext = checkFormat(vcfname) | |
448 if ext in ['gzip','other'] : | |
449 raise Exception(('Input file %s is gzip-formatted, must be either ' | |
450 'uncompressed or zipped with bgzip' % vcfname)) | |
451 file_uncompressed = (ext == 'vcf') | |
452 reader_uncompressed = (file_uncompressed and not compress_flag) | |
453 if compress_flag and file_uncompressed: | |
454 vcfname = compressVcf(vcfname) | |
455 #subsamp_list = None | |
456 if subsamp_num is not None: | |
457 if subsamp_list is not None: | |
458 raise Exception('Multiple subsampling options called in getVcfReader') | |
459 subsamp_list = getSubsampleList(vcfname, subsamp_num) | |
460 elif subsamp_fn is not None: | |
461 if subsamp_list is not None: | |
462 raise Exception('Multiple subsampling options called in getVcfReader') | |
463 subsamp_file = open(subsamp_fn,'r') | |
464 subsamp_list = [l.strip() for l in subsamp_file.readlines()] | |
465 subsamp_file.close() | |
466 if index is None: | |
467 vcf_reader = pysam.VariantFile(vcfname) | |
468 else: | |
469 vcf_reader = pysam.VariantFile(vcfname, index_filename=index) | |
470 if subsamp_list is not None: | |
471 logging.debug('Subsampling %d individuals from VCF file' % | |
472 (len(subsamp_list))) | |
473 vcf_reader.subset_samples(subsamp_list) | |
474 return vcf_reader, reader_uncompressed |