Mercurial > repos > jaredgk > ppp_vcfphase
comparison vcf_reader_func.py @ 0:3830d29fca6a draft
Uploaded
| author | jaredgk |
|---|---|
| date | Mon, 15 Oct 2018 18:15:47 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:3830d29fca6a |
|---|---|
| 1 import sys | |
| 2 import pysam | |
| 3 import logging | |
| 4 import struct | |
| 5 from random import sample | |
| 6 from collections import defaultdict | |
| 7 import os | |
| 8 import gzip | |
| 9 | |
| 10 def checkIfGzip(filename): | |
| 11 try: | |
| 12 gf = gzip.open(filename) | |
| 13 gl = gf.readline() | |
| 14 gf.close() | |
| 15 vcf_check = b'##fileformat=VCF' | |
| 16 if gl[0:3] == b'BCF': | |
| 17 return 'bcf' | |
| 18 elif gl[:len(vcf_check)] == vcf_check: | |
| 19 return checkHeader(filename) | |
| 20 else: | |
| 21 return 'other' | |
| 22 except: | |
| 23 return 'nozip' | |
| 24 | |
| 25 def checkHeader(filename): | |
| 26 f = open(filename,'rb') | |
| 27 l = f.readline() | |
| 28 f.close() | |
| 29 BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00' | |
| 30 #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff' | |
| 31 GZF_HEADER=b'\x1f\x8b' | |
| 32 if l[:len(BGZF_HEADER)] == BGZF_HEADER: | |
| 33 return 'bgzip' | |
| 34 if l[:len(GZF_HEADER)] == GZF_HEADER: | |
| 35 return 'gzip' | |
| 36 return 'nozip' | |
| 37 | |
| 38 def checkFormat(vcfname): | |
| 39 """Checks header of given file for compression type | |
| 40 | |
| 41 | |
| 42 Given a filename, opens file and reads first line to check if | |
| 43 file has BGZF or GZIP header. May be extended to check for BCF format | |
| 44 | |
| 45 Parameters | |
| 46 ---------- | |
| 47 filename : str | |
| 48 Name of file to be checked | |
| 49 | |
| 50 Returns | |
| 51 ------- | |
| 52 extension : str {'bgzip','gzip','vcf','other'} | |
| 53 File extension as indicated by header | |
| 54 | |
| 55 """ | |
| 56 typ = checkIfGzip(vcfname) | |
| 57 if typ != 'nozip': | |
| 58 return typ | |
| 59 f = open(vcfname) | |
| 60 l = f.readline() | |
| 61 f.close() | |
| 62 VCF_TAG='##fileformat=VCF' | |
| 63 if l[:len(VCF_TAG)] == VCF_TAG: | |
| 64 return 'vcf' | |
| 65 return 'other' | |
| 66 | |
| 67 def checkIfCpG(record,fasta_ref,offset=0,add_chr=False): | |
| 68 dr = None | |
| 69 pos = record.pos | |
| 70 c = record.chrom | |
| 71 if record.alts is None: | |
| 72 return False | |
| 73 if add_chr: | |
| 74 c = 'chr'+record.chrom | |
| 75 if record.ref == 'C' and 'T' in record.alts: | |
| 76 seq = fasta_ref.fetch(c,pos-1,pos+1) | |
| 77 if seq[0].upper() != 'C': | |
| 78 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0])) | |
| 79 #raise Exception("checkIfCpG function not lining up properly") | |
| 80 if seq[1].upper() == 'G': | |
| 81 return True | |
| 82 return False | |
| 83 elif record.ref == 'G' and 'A' in record.alts: | |
| 84 seq = fasta_ref.fetch(c,pos-2,pos) | |
| 85 if seq[1].upper() != 'G': | |
| 86 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1])) | |
| 87 #raise Exception("checkIfCpg function not lining up on negative strand") | |
| 88 if seq[0].upper() == 'C': | |
| 89 return True | |
| 90 return False | |
| 91 return False | |
| 92 | |
| 93 def checkForDuplicates(rec_list,pass_list): | |
| 94 for i in range(len(rec_list)-1): | |
| 95 if rec_list[i].pos == rec_list[i+1].pos: | |
| 96 pass_list[i] = False | |
| 97 pass_list[i+1] = False | |
| 98 | |
| 99 def checkForMultiallele(rec_list,pass_list): | |
| 100 for i in range(len(rec_list)): | |
| 101 if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos: | |
| 102 pass_list[i] = False | |
| 103 pass_list[i+1] = False | |
| 104 if len(rec_list[i].alleles) > 2: | |
| 105 pass_list[i] = False | |
| 106 | |
| 107 def flipChrom(chrom): | |
| 108 if chrom[0:3] == 'chr': | |
| 109 return chrom[0:3] | |
| 110 return 'chr'+chrom | |
| 111 | |
| 112 def getAlleleCountDict(rec): | |
| 113 alleles = defaultdict(int) | |
| 114 total_sites = 0 | |
| 115 missing_inds = 0 | |
| 116 for j in range(len(rec.samples)): | |
| 117 samp = rec.samples[j] | |
| 118 if None in samp.alleles: | |
| 119 missing_inds += 1 | |
| 120 for k in range(len(samp.alleles)): | |
| 121 b = samp.alleles[k] | |
| 122 if b is not None: | |
| 123 alleles[b] += 1 | |
| 124 total_sites+=1 | |
| 125 return alleles, total_sites, missing_inds | |
| 126 | |
| 127 def isInformative(rec, mincount=2, alleles=None): | |
| 128 count = 0 | |
| 129 if alleles is None: | |
| 130 alleles, total_sites, missing_inds = getAlleleCountDict(rec) | |
| 131 if len(alleles) != 2: | |
| 132 return False | |
| 133 i1,i2 = alleles.keys() | |
| 134 return (alleles[i1] >= mincount and alleles[i2] >= mincount) | |
| 135 | |
| 136 def getPassSites(record_list, remove_cpg=False, remove_indels=True, | |
| 137 remove_multiallele=True, remove_missing=0, | |
| 138 inform_level=2, fasta_ref=None): | |
| 139 pass_list = [True for r in record_list] | |
| 140 if remove_cpg == True and fasta_ref is None: | |
| 141 raise Exception("CpG removal requires a reference") | |
| 142 if inform_level > 2 or inform_level < 0: | |
| 143 raise Exception("Inform level %d must be between 0 and 2" % inform_level) | |
| 144 if remove_multiallele: | |
| 145 checkForMultiallele(record_list,pass_list) | |
| 146 for i in range(len(record_list)): | |
| 147 rec = record_list[i] | |
| 148 logging.info(rec.pos) | |
| 149 if remove_indels and not checkRecordIsSnp(rec): | |
| 150 pass_list[i] = False | |
| 151 if remove_cpg and checkIfCpG(rec,fasta_ref): | |
| 152 pass_list[i] = False | |
| 153 alleles,total_sites,missing_inds = getAlleleCountDict(rec) | |
| 154 if remove_missing != -1 and missing_inds > remove_missing: | |
| 155 pass_list[i] = False | |
| 156 if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles): | |
| 157 pass_list[i] = False | |
| 158 #pp = zip([r.pos for r in record_list],pass_list) | |
| 159 #for ppp in pp: | |
| 160 # logging.info(ppp) | |
| 161 return pass_list | |
| 162 | |
| 163 | |
| 164 def filterSites(record_list, remove_cpg=False, remove_indels=True, | |
| 165 remove_multiallele=True, remove_missing=0, inform_level=2, | |
| 166 fasta_ref=None): | |
| 167 pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref) | |
| 168 out_list = [] | |
| 169 for i in range(len(pass_list)): | |
| 170 if pass_list[i]: | |
| 171 out_list.append(record_list[i]) | |
| 172 return out_list | |
| 173 | |
| 174 | |
| 175 | |
| 176 class VcfReader(): | |
| 177 def __init__(self, vcfname, compress_flag=False, subsamp_num=None, | |
| 178 subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False): | |
| 179 | |
| 180 ext = checkFormat(vcfname) | |
| 181 if ext in ['gzip','other'] : | |
| 182 raise Exception(('Input file %s is gzip-formatted, must be either ' | |
| 183 'uncompressed or zipped with bgzip' % vcfname)) | |
| 184 self.file_uncompressed = (ext == 'vcf') | |
| 185 self.reader_uncompressed = (self.file_uncompressed and not compress_flag) | |
| 186 self.popmodel = None | |
| 187 self.popkeys = None | |
| 188 if popmodel is not None and use_allpop: | |
| 189 raise Exception("Popmodel and allpop cannot both be specified") | |
| 190 if compress_flag and file_uncompressed: | |
| 191 vcfname = compressVcf(vcfname) | |
| 192 if subsamp_num is not None: | |
| 193 if subsamp_list is not None: | |
| 194 raise Exception('Multiple subsampling options called in getVcfReader') | |
| 195 subsamp_list = getSubsampleList(vcfname, subsamp_num) | |
| 196 elif subsamp_fn is not None: | |
| 197 if subsamp_list is not None: | |
| 198 raise Exception('Multiple subsampling options called in getVcfReader') | |
| 199 subsamp_file = open(subsamp_fn,'r') | |
| 200 subsamp_list = [l.strip() for l in subsamp_file.readlines()] | |
| 201 subsamp_file.close() | |
| 202 | |
| 203 if index is None: | |
| 204 self.reader = pysam.VariantFile(vcfname) | |
| 205 else: | |
| 206 self.reader = pysam.VariantFile(vcfname, index_filename=index) | |
| 207 if popmodel is not None: | |
| 208 self.popmodel = popmodel | |
| 209 popsamp_list = popmodel.inds | |
| 210 self.reader.subset_samples(popsamp_list) | |
| 211 self.setPopIdx() | |
| 212 if use_allpop: | |
| 213 self.setAllPop() | |
| 214 if subsamp_list is not None: | |
| 215 logging.debug('Subsampling %d individuals from VCF file' % | |
| 216 (len(subsamp_list))) | |
| 217 self.reader.subset_samples(subsamp_list) | |
| 218 self.prev_last_rec = next(self.reader) | |
| 219 self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr') | |
| 220 | |
| 221 def fetch(self, chrom=None, start=None, end=None): | |
| 222 return self.reader.fetch(chrom, start, end) | |
| 223 | |
| 224 def getRecordList(self, region=None, chrom=None, start=None, | |
| 225 end=None): | |
| 226 if self.reader_uncompressed: | |
| 227 ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom) | |
| 228 return ret | |
| 229 else: | |
| 230 return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom) | |
| 231 | |
| 232 def setPopIdx(self): | |
| 233 self.popkeys = {} | |
| 234 sample_names = [l for l in self.reader.header.samples] | |
| 235 for p in self.popmodel.pop_list: | |
| 236 self.popkeys[p] = [] | |
| 237 for ind in self.popmodel.ind_dict[p]: | |
| 238 self.popkeys[p].append(sample_names.index(ind)) | |
| 239 | |
| 240 def close(self): | |
| 241 self.reader.close() | |
| 242 | |
| 243 def setAllPop(self): | |
| 244 self.popkeys = {'ALL':[]} | |
| 245 for i in range(len(self.reader.header.samples)): | |
| 246 self.popkeys['ALL'].append(i) | |
| 247 | |
| 248 def modChrom(c,vcf_chr): | |
| 249 if c is None: | |
| 250 return None | |
| 251 if vcf_chr and c[:3] != 'chr': | |
| 252 return 'chr'+c | |
| 253 if not vcf_chr and c[:3] == 'chr': | |
| 254 return c[3:] | |
| 255 return c | |
| 256 | |
| 257 def getRecordList(vcf_reader, region=None, chrom=None, start=None, | |
| 258 end=None, add_chr=False): | |
| 259 """Returns list for use in subsampling from input file""" | |
| 260 if region is not None: | |
| 261 c = modChrom(region.chrom,add_chr) | |
| 262 var_sites = vcf_reader.fetch(c, region.start, region.end) | |
| 263 else: | |
| 264 c = modChrom(chrom,add_chr) | |
| 265 var_sites = vcf_reader.fetch(c, start, end) | |
| 266 lst = [] | |
| 267 for rec in var_sites: | |
| 268 lst.append(rec) | |
| 269 return lst | |
| 270 | |
| 271 | |
| 272 def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None, | |
| 273 start=None, end=None, add_chr=False): | |
| 274 """Method for getting record list from unzipped VCF file. | |
| 275 | |
| 276 This method will sequentially look through a VCF file until it finds | |
| 277 the given `start` position on `chrom`, then add all records to a list | |
| 278 until the `end` position has been reached. Note that `prev_last_rec` | |
| 279 must be kept track of externally to ensure that if consecutive regions | |
| 280 are called, the record of the first variant outside the first region | |
| 281 is not lost. | |
| 282 | |
| 283 Parameters | |
| 284 ---------- | |
| 285 vcf_reader : pysam VariantFile object | |
| 286 VCF reader initialized from other function | |
| 287 region : region object | |
| 288 Region object with start and end coordinates of region of interest. | |
| 289 prev_last_rec : VariantRecord object | |
| 290 Variable with last record read from VcfReader. Stored here so that | |
| 291 if two adjacent regions are called, the overflow record from the | |
| 292 first region is still included in the next region | |
| 293 | |
| 294 Returns | |
| 295 ------- | |
| 296 lst : list | |
| 297 List of records in given gene region | |
| 298 prev_last_rec : VariantRecord object | |
| 299 First record after target region, for use in next call | |
| 300 | |
| 301 """ | |
| 302 lst = [] | |
| 303 if region is None: | |
| 304 lst.append(prev_last_rec) | |
| 305 for rec in vcf_reader: | |
| 306 lst.append(rec) | |
| 307 return lst, lst[-1] | |
| 308 | |
| 309 if (prev_last_rec is not None and | |
| 310 region.containsRecord(prev_last_rec) == 'in'): | |
| 311 lst.append(prev_last_rec) | |
| 312 elif (prev_last_rec is not None and | |
| 313 region.containsRecord(prev_last_rec) == 'after'): | |
| 314 return [] | |
| 315 rec = next(vcf_reader,None) | |
| 316 if rec is None: | |
| 317 return lst,None | |
| 318 place = region.containsRecord(rec) | |
| 319 while rec is not None and place != 'after': | |
| 320 if place == 'in': | |
| 321 lst.append(rec) | |
| 322 rec = next(vcf_reader,None) | |
| 323 if rec is None: | |
| 324 break | |
| 325 place = region.containsRecord(rec) | |
| 326 prev_last_rec = rec | |
| 327 return lst, prev_last_rec | |
| 328 | |
| 329 | |
| 330 def checkRecordIsSnp(rec): | |
| 331 """Checks if this record is a single nucleotide variant, returns bool.""" | |
| 332 if len(rec.ref) != 1: | |
| 333 return False | |
| 334 if rec.alts is None: | |
| 335 return False | |
| 336 for allele in rec.alts: | |
| 337 if len(allele) != 1: | |
| 338 return False | |
| 339 return True | |
| 340 | |
| 341 | |
| 342 def getSubsampleList(vcfname, ss_count): | |
| 343 """Returns a list of the first `ss_count` individuals in `vcfname` | |
| 344 | |
| 345 """ | |
| 346 | |
| 347 vcf_o = pysam.VariantFile(vcfname) | |
| 348 rec = next(vcf_o) | |
| 349 vcf_o.close() | |
| 350 lst = [] | |
| 351 for samp in rec.samples: | |
| 352 lst.append(samp) | |
| 353 return lst[:int(ss_count)] | |
| 354 | |
| 355 | |
| 356 def compressVcf(vcfname,forceflag=False,remove=False): | |
| 357 """Runs bgzip and tabix on input VCF file. | |
| 358 | |
| 359 Using the pysam library, this function runs the bgzip and tabix utilities | |
| 360 on the given input file. By default, this will not overwrite an existing | |
| 361 zipped file, but will overwrite an existing index. `remove` can be set to | |
| 362 delete the unzipped file. | |
| 363 | |
| 364 Parameters | |
| 365 ---------- | |
| 366 vcfname : str | |
| 367 Name of uncompressed VCF file | |
| 368 forceflag : bool (False) | |
| 369 If true, will overwrite (vcfname).gz if it exists | |
| 370 remove : bool (False) | |
| 371 If true, will delete uncompressed source file | |
| 372 | |
| 373 Returns | |
| 374 ------- | |
| 375 cvcfname : str | |
| 376 Filepath to compressed VCF file | |
| 377 """ | |
| 378 cvcfname = vcfname+".gz" | |
| 379 pysam.tabix_compress(vcfname,cvcfname,force=forceflag) | |
| 380 pysam.tabix_index(cvcfname,preset="vcf",force=True) | |
| 381 if remove: | |
| 382 os.remove(vcfname) | |
| 383 return cvcfname | |
| 384 | |
| 385 def vcfRegionName(prefix, region, ext, oneidx=False, | |
| 386 halfopen=True, sep='-'): | |
| 387 chrom = region.toStr(halfopen, oneidx, sep) | |
| 388 return prefix+'_'+chrom+'.'+ext | |
| 389 | |
| 390 def getRecordsInRegion(region, record_list): | |
| 391 sub_list = [] | |
| 392 for i in range(len(record_list)): | |
| 393 loc = region.containsRecord(record_list[i]) | |
| 394 if loc == "in": | |
| 395 sub_list.append(record_list[i]) | |
| 396 elif loc == "after": | |
| 397 break | |
| 398 return sub_list | |
| 399 | |
| 400 | |
| 401 | |
| 402 | |
| 403 | |
| 404 #def getVcfReader(args): | |
| 405 def getVcfReader(vcfname, compress_flag=False, subsamp_num=None, | |
| 406 subsamp_fn=None, subsamp_list=None, index=None): | |
| 407 """Returns a reader for a given input VCF file. | |
| 408 | |
| 409 Given a filename, filetype, compression option, and optional Subsampling | |
| 410 options, will return a pysam.VariantFile object for iteration and | |
| 411 a flag as to whether this file is compressed or uncompressed. | |
| 412 | |
| 413 Parameters | |
| 414 ---------- | |
| 415 vcfname : str | |
| 416 Filename for VCF file. The extension of this file will be used to | |
| 417 determine whether it is compressed or not unless `var_ext` is set. | |
| 418 var_ext : str (None) | |
| 419 Extension for VCF file if it is not included in the filename. | |
| 420 compress_flag : bool (False) | |
| 421 If filetype is uncompressed and this is set to true, will run | |
| 422 compressVcf function. | |
| 423 subsamp_num : int (None) | |
| 424 If set, will randomly select `subsamp_num` individuals (not | |
| 425 genotypes) from the input VCF file and return a reader with | |
| 426 only those data. | |
| 427 subsamp_fn : str (None) | |
| 428 If set, will return a reader with only data from the samples listed | |
| 429 in the file provided. Cannot be used with other subsampling options. | |
| 430 subsamp_list : list (None) | |
| 431 If set, will return reader with records containing only | |
| 432 individuals named in the list. Cannot be used with other subsampling | |
| 433 options. | |
| 434 | |
| 435 Returns | |
| 436 ------- | |
| 437 vcf_reader : pysam.VariantFile | |
| 438 A reader that can be iterated through for variant records. If | |
| 439 compressed, it will be able to use the pysam fetch method, otherwise | |
| 440 it must be read through sequentially | |
| 441 reader_uncompressed : bool | |
| 442 If True, VCF reader is uncompressed. This means the fetch method | |
| 443 cannot be used and region access must be done using the | |
| 444 "getRecordListUnzipped" method. | |
| 445 | |
| 446 """ | |
| 447 ext = checkFormat(vcfname) | |
| 448 if ext in ['gzip','other'] : | |
| 449 raise Exception(('Input file %s is gzip-formatted, must be either ' | |
| 450 'uncompressed or zipped with bgzip' % vcfname)) | |
| 451 file_uncompressed = (ext == 'vcf') | |
| 452 reader_uncompressed = (file_uncompressed and not compress_flag) | |
| 453 if compress_flag and file_uncompressed: | |
| 454 vcfname = compressVcf(vcfname) | |
| 455 #subsamp_list = None | |
| 456 if subsamp_num is not None: | |
| 457 if subsamp_list is not None: | |
| 458 raise Exception('Multiple subsampling options called in getVcfReader') | |
| 459 subsamp_list = getSubsampleList(vcfname, subsamp_num) | |
| 460 elif subsamp_fn is not None: | |
| 461 if subsamp_list is not None: | |
| 462 raise Exception('Multiple subsampling options called in getVcfReader') | |
| 463 subsamp_file = open(subsamp_fn,'r') | |
| 464 subsamp_list = [l.strip() for l in subsamp_file.readlines()] | |
| 465 subsamp_file.close() | |
| 466 if index is None: | |
| 467 vcf_reader = pysam.VariantFile(vcfname) | |
| 468 else: | |
| 469 vcf_reader = pysam.VariantFile(vcfname, index_filename=index) | |
| 470 if subsamp_list is not None: | |
| 471 logging.debug('Subsampling %d individuals from VCF file' % | |
| 472 (len(subsamp_list))) | |
| 473 vcf_reader.subset_samples(subsamp_list) | |
| 474 return vcf_reader, reader_uncompressed |
