diff vcf_reader_func.py @ 0:3830d29fca6a draft

Uploaded
author jaredgk
date Mon, 15 Oct 2018 18:15:47 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_reader_func.py	Mon Oct 15 18:15:47 2018 -0400
@@ -0,0 +1,474 @@
+import sys
+import pysam
+import logging
+import struct
+from random import sample
+from collections import defaultdict
+import os
+import gzip
+
+def checkIfGzip(filename):
+    try:
+        gf = gzip.open(filename)
+        gl = gf.readline()
+        gf.close()
+        vcf_check = b'##fileformat=VCF'
+        if gl[0:3] == b'BCF':
+            return 'bcf'
+        elif gl[:len(vcf_check)] == vcf_check:
+            return checkHeader(filename)
+        else:
+            return 'other'
+    except:
+        return 'nozip'
+
+def checkHeader(filename):
+    f = open(filename,'rb')
+    l = f.readline()
+    f.close()
+    BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00'
+    #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff'
+    GZF_HEADER=b'\x1f\x8b'
+    if l[:len(BGZF_HEADER)] == BGZF_HEADER:
+        return 'bgzip'
+    if l[:len(GZF_HEADER)] == GZF_HEADER:
+        return 'gzip'
+    return 'nozip'
+
+def checkFormat(vcfname):
+    """Checks header of given file for compression type
+
+
+    Given a filename, opens file and reads first line to check if
+    file has BGZF or GZIP header. May be extended to check for BCF format
+
+    Parameters
+    ----------
+    filename : str
+        Name of file to be checked
+
+    Returns
+    -------
+    extension : str {'bgzip','gzip','vcf','other'}
+        File extension as indicated by header
+
+    """
+    typ = checkIfGzip(vcfname)
+    if typ != 'nozip':
+        return typ
+    f = open(vcfname)
+    l = f.readline()
+    f.close()
+    VCF_TAG='##fileformat=VCF'
+    if l[:len(VCF_TAG)] == VCF_TAG:
+        return 'vcf'
+    return 'other'
+
+def checkIfCpG(record,fasta_ref,offset=0,add_chr=False):
+    dr = None
+    pos = record.pos
+    c = record.chrom
+    if record.alts is None:
+        return False
+    if add_chr:
+        c = 'chr'+record.chrom
+    if record.ref == 'C' and 'T' in record.alts:
+        seq = fasta_ref.fetch(c,pos-1,pos+1)
+        if seq[0].upper() != 'C':
+            logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0]))
+            #raise Exception("checkIfCpG function not lining up properly")
+        if seq[1].upper() == 'G':
+            return True
+        return False
+    elif record.ref == 'G' and 'A' in record.alts:
+        seq = fasta_ref.fetch(c,pos-2,pos)
+        if seq[1].upper() != 'G':
+            logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1]))
+            #raise Exception("checkIfCpg function not lining up on negative strand")
+        if seq[0].upper() == 'C':
+            return True
+        return False
+    return False
+
+def checkForDuplicates(rec_list,pass_list):
+    for i in range(len(rec_list)-1):
+        if rec_list[i].pos == rec_list[i+1].pos:
+            pass_list[i] = False
+            pass_list[i+1] = False
+
+def checkForMultiallele(rec_list,pass_list):
+    for i in range(len(rec_list)):
+        if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos:
+            pass_list[i] = False
+            pass_list[i+1] = False
+        if len(rec_list[i].alleles) > 2:
+            pass_list[i] = False
+
+def flipChrom(chrom):
+    if chrom[0:3] == 'chr':
+        return chrom[0:3]
+    return 'chr'+chrom
+
+def getAlleleCountDict(rec):
+    alleles = defaultdict(int)
+    total_sites = 0
+    missing_inds = 0
+    for j in range(len(rec.samples)):
+        samp = rec.samples[j]
+        if None in samp.alleles:
+            missing_inds += 1
+        for k in range(len(samp.alleles)):
+            b = samp.alleles[k]
+            if b is not None:
+                alleles[b] += 1
+            total_sites+=1
+    return alleles, total_sites, missing_inds
+
+def isInformative(rec, mincount=2, alleles=None):
+    count = 0
+    if alleles is None:
+        alleles, total_sites, missing_inds = getAlleleCountDict(rec)
+    if len(alleles) != 2:
+        return False
+    i1,i2 = alleles.keys()
+    return (alleles[i1] >= mincount and alleles[i2] >= mincount)
+
+def getPassSites(record_list, remove_cpg=False, remove_indels=True,
+                 remove_multiallele=True, remove_missing=0,
+                 inform_level=2, fasta_ref=None):
+    pass_list = [True for r in record_list]
+    if remove_cpg == True and fasta_ref is None:
+        raise Exception("CpG removal requires a reference")
+    if inform_level > 2 or inform_level < 0:
+        raise Exception("Inform level %d must be between 0 and 2" % inform_level)
+    if remove_multiallele:
+        checkForMultiallele(record_list,pass_list)
+    for i in range(len(record_list)):
+        rec = record_list[i]
+        logging.info(rec.pos)
+        if remove_indels and not checkRecordIsSnp(rec):
+            pass_list[i] = False
+        if remove_cpg and checkIfCpG(rec,fasta_ref):
+            pass_list[i] = False
+        alleles,total_sites,missing_inds = getAlleleCountDict(rec)
+        if remove_missing != -1 and missing_inds > remove_missing:
+            pass_list[i] = False
+        if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles):
+            pass_list[i] = False
+    #pp = zip([r.pos for r in record_list],pass_list)
+    #for ppp in pp:
+    #    logging.info(ppp)
+    return pass_list
+
+
+def filterSites(record_list, remove_cpg=False, remove_indels=True,
+                remove_multiallele=True, remove_missing=0, inform_level=2,
+                fasta_ref=None):
+    pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref)
+    out_list = []
+    for i in range(len(pass_list)):
+        if pass_list[i]:
+            out_list.append(record_list[i])
+    return out_list
+
+
+
+class VcfReader():
+    def __init__(self, vcfname, compress_flag=False, subsamp_num=None,
+                 subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False):
+
+        ext = checkFormat(vcfname)
+        if ext in ['gzip','other'] :
+            raise Exception(('Input file %s is gzip-formatted, must be either '
+                             'uncompressed or zipped with bgzip' % vcfname))
+        self.file_uncompressed = (ext == 'vcf')
+        self.reader_uncompressed = (self.file_uncompressed and not compress_flag)
+        self.popmodel = None
+        self.popkeys = None
+        if popmodel is not None and use_allpop:
+            raise Exception("Popmodel and allpop cannot both be specified")
+        if compress_flag and file_uncompressed:
+            vcfname = compressVcf(vcfname)
+        if subsamp_num is not None:
+            if subsamp_list is not None:
+                raise Exception('Multiple subsampling options called in getVcfReader')
+            subsamp_list = getSubsampleList(vcfname, subsamp_num)
+        elif subsamp_fn is not None:
+            if subsamp_list is not None:
+                raise Exception('Multiple subsampling options called in getVcfReader')
+            subsamp_file = open(subsamp_fn,'r')
+            subsamp_list = [l.strip() for l in subsamp_file.readlines()]
+            subsamp_file.close()
+
+        if index is None:
+            self.reader = pysam.VariantFile(vcfname)
+        else:
+            self.reader = pysam.VariantFile(vcfname, index_filename=index)
+        if popmodel is not None:
+            self.popmodel = popmodel
+            popsamp_list = popmodel.inds
+            self.reader.subset_samples(popsamp_list)
+            self.setPopIdx()
+        if use_allpop:
+            self.setAllPop()
+        if subsamp_list is not None:
+            logging.debug('Subsampling %d individuals from VCF file' %
+            (len(subsamp_list)))
+            self.reader.subset_samples(subsamp_list)
+        self.prev_last_rec = next(self.reader)
+        self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr')
+
+    def fetch(self, chrom=None, start=None, end=None):
+        return self.reader.fetch(chrom, start, end)
+
+    def getRecordList(self, region=None, chrom=None, start=None,
+                      end=None):
+        if self.reader_uncompressed:
+            ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom)
+            return ret
+        else:
+            return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom)
+
+    def setPopIdx(self):
+        self.popkeys = {}
+        sample_names = [l for l in self.reader.header.samples]
+        for p in self.popmodel.pop_list:
+            self.popkeys[p] = []
+            for ind in self.popmodel.ind_dict[p]:
+                self.popkeys[p].append(sample_names.index(ind))
+
+    def close(self):
+        self.reader.close()
+
+    def setAllPop(self):
+        self.popkeys = {'ALL':[]}
+        for i in range(len(self.reader.header.samples)):
+            self.popkeys['ALL'].append(i)
+
+def modChrom(c,vcf_chr):
+    if c is None:
+        return None
+    if vcf_chr and c[:3] != 'chr':
+        return 'chr'+c
+    if not vcf_chr and c[:3] == 'chr':
+        return c[3:]
+    return c
+
+def getRecordList(vcf_reader, region=None, chrom=None, start=None,
+                  end=None, add_chr=False):
+    """Returns list for use in subsampling from input file"""
+    if region is not None:
+        c = modChrom(region.chrom,add_chr)
+        var_sites = vcf_reader.fetch(c, region.start, region.end)
+    else:
+        c = modChrom(chrom,add_chr)
+        var_sites = vcf_reader.fetch(c, start, end)
+    lst = []
+    for rec in var_sites:
+        lst.append(rec)
+    return lst
+
+
+def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None,
+                          start=None, end=None, add_chr=False):
+    """Method for getting record list from unzipped VCF file.
+
+    This method will sequentially look through a VCF file until it finds
+    the given `start` position on `chrom`, then add all records to a list
+    until the `end` position has been reached. Note that `prev_last_rec`
+    must be kept track of externally to ensure that if consecutive regions
+    are called, the record of the first variant outside the first region
+    is not lost.
+
+    Parameters
+    ----------
+    vcf_reader : pysam VariantFile object
+        VCF reader initialized from other function
+    region : region object
+        Region object with start and end coordinates of region of interest.
+    prev_last_rec : VariantRecord object
+        Variable with last record read from VcfReader. Stored here so that
+        if two adjacent regions are called, the overflow record from the
+        first region is still included in the next region
+
+    Returns
+    -------
+    lst : list
+        List of records in given gene region
+    prev_last_rec : VariantRecord object
+        First record after target region, for use in next call
+
+    """
+    lst = []
+    if region is None:
+        lst.append(prev_last_rec)
+        for rec in vcf_reader:
+            lst.append(rec)
+        return lst, lst[-1]
+
+    if (prev_last_rec is not None and
+        region.containsRecord(prev_last_rec) == 'in'):
+        lst.append(prev_last_rec)
+    elif (prev_last_rec is not None and
+         region.containsRecord(prev_last_rec) == 'after'):
+        return []
+    rec = next(vcf_reader,None)
+    if rec is None:
+        return lst,None
+    place = region.containsRecord(rec)
+    while rec is not None and place != 'after':
+        if place == 'in':
+            lst.append(rec)
+        rec = next(vcf_reader,None)
+        if rec is None:
+            break
+        place = region.containsRecord(rec)
+    prev_last_rec = rec
+    return lst, prev_last_rec
+
+
+def checkRecordIsSnp(rec):
+    """Checks if this record is a single nucleotide variant, returns bool."""
+    if len(rec.ref) != 1:
+        return False
+    if rec.alts is None:
+        return False
+    for allele in rec.alts:
+        if len(allele) != 1:
+            return False
+    return True
+
+
+def getSubsampleList(vcfname, ss_count):
+    """Returns a list of the first `ss_count` individuals in `vcfname`
+
+    """
+
+    vcf_o = pysam.VariantFile(vcfname)
+    rec = next(vcf_o)
+    vcf_o.close()
+    lst = []
+    for samp in rec.samples:
+        lst.append(samp)
+    return lst[:int(ss_count)]
+
+
+def compressVcf(vcfname,forceflag=False,remove=False):
+    """Runs bgzip and tabix on input VCF file.
+
+    Using the pysam library, this function runs the bgzip and tabix utilities
+    on the given input file. By default, this will not overwrite an existing
+    zipped file, but will overwrite an existing index. `remove` can be set to
+    delete the unzipped file.
+
+    Parameters
+    ----------
+    vcfname : str
+        Name of uncompressed VCF file
+    forceflag : bool (False)
+        If true, will overwrite (vcfname).gz if it exists
+    remove : bool (False)
+        If true, will delete uncompressed source file
+
+    Returns
+    -------
+    cvcfname : str
+        Filepath to compressed VCF file
+    """
+    cvcfname = vcfname+".gz"
+    pysam.tabix_compress(vcfname,cvcfname,force=forceflag)
+    pysam.tabix_index(cvcfname,preset="vcf",force=True)
+    if remove:
+        os.remove(vcfname)
+    return cvcfname
+
+def vcfRegionName(prefix, region, ext, oneidx=False,
+                  halfopen=True, sep='-'):
+    chrom = region.toStr(halfopen, oneidx, sep)
+    return prefix+'_'+chrom+'.'+ext
+
+def getRecordsInRegion(region, record_list):
+    sub_list = []
+    for i in range(len(record_list)):
+        loc = region.containsRecord(record_list[i])
+        if loc == "in":
+            sub_list.append(record_list[i])
+        elif loc == "after":
+            break
+    return sub_list
+
+
+
+
+
+#def getVcfReader(args):
+def getVcfReader(vcfname, compress_flag=False, subsamp_num=None,
+                 subsamp_fn=None, subsamp_list=None, index=None):
+    """Returns a reader for a given input VCF file.
+
+    Given a filename, filetype, compression option, and optional Subsampling
+    options, will return a pysam.VariantFile object for iteration and
+    a flag as to whether this file is compressed or uncompressed.
+
+    Parameters
+    ----------
+    vcfname : str
+        Filename for VCF file. The extension of this file will be used to
+        determine whether it is compressed or not unless `var_ext` is set.
+    var_ext : str (None)
+        Extension for VCF file if it is not included in the filename.
+    compress_flag : bool (False)
+        If filetype is uncompressed and this is set to true, will run
+        compressVcf function.
+    subsamp_num : int (None)
+        If set, will randomly select `subsamp_num` individuals (not
+        genotypes) from the input VCF file and return a reader with
+        only those data.
+    subsamp_fn : str (None)
+        If set, will return a reader with only data from the samples listed
+        in the file provided. Cannot be used with other subsampling options.
+    subsamp_list : list (None)
+        If set, will return reader with records containing only
+        individuals named in the list. Cannot be used with other subsampling
+        options.
+
+    Returns
+    -------
+    vcf_reader : pysam.VariantFile
+        A reader that can be iterated through for variant records. If
+        compressed, it will be able to use the pysam fetch method, otherwise
+        it must be read through sequentially
+    reader_uncompressed : bool
+        If True, VCF reader is uncompressed. This means the fetch method
+        cannot be used and region access must be done using the
+        "getRecordListUnzipped" method.
+
+    """
+    ext = checkFormat(vcfname)
+    if ext in ['gzip','other'] :
+        raise Exception(('Input file %s is gzip-formatted, must be either '
+                         'uncompressed or zipped with bgzip' % vcfname))
+    file_uncompressed = (ext == 'vcf')
+    reader_uncompressed = (file_uncompressed and not compress_flag)
+    if compress_flag and file_uncompressed:
+        vcfname = compressVcf(vcfname)
+    #subsamp_list = None
+    if subsamp_num is not None:
+        if subsamp_list is not None:
+            raise Exception('Multiple subsampling options called in getVcfReader')
+        subsamp_list = getSubsampleList(vcfname, subsamp_num)
+    elif subsamp_fn is not None:
+        if subsamp_list is not None:
+            raise Exception('Multiple subsampling options called in getVcfReader')
+        subsamp_file = open(subsamp_fn,'r')
+        subsamp_list = [l.strip() for l in subsamp_file.readlines()]
+        subsamp_file.close()
+    if index is None:
+        vcf_reader = pysam.VariantFile(vcfname)
+    else:
+        vcf_reader = pysam.VariantFile(vcfname, index_filename=index)
+    if subsamp_list is not None:
+        logging.debug('Subsampling %d individuals from VCF file' %
+        (len(subsamp_list)))
+        vcf_reader.subset_samples(subsamp_list)
+    return vcf_reader, reader_uncompressed