changeset 0:9127b49793a1 draft default tip

Uploaded
author jaredgk
date Fri, 12 Oct 2018 14:14:52 -0400
parents
children
files four_gamete_pysam.py four_gamete_wrapper.xml gene_region.py logging_module.py vcf_reader_func.py
diffstat 5 files changed, 1900 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/four_gamete_pysam.py	Fri Oct 12 14:14:52 2018 -0400
@@ -0,0 +1,899 @@
+
+"""
+    samples an interval based on four-gamete tests from aligned sequences
+"""
+"""
+    written for python 3.6
+    2 possible kinds of 4 gamete-based intervals:
+        HandK85, minimal parsimonious set of intervals, such that each contains
+            one or more recombination events:
+            each interval has endpoints that are informative sites that are
+            incompatible on the basis of 4gamete criterion
+            each is inferred to contain at least 1 recombination event
+            a set of intervals form a  parsimonious non-overlapping set that
+            are consistent with the data regions not in those intervals show no
+            evidence of recombination
+        CONTIG4GPASS, set of most inclusive intervals, such that all the sites
+            in each are fully compatible with each other:
+            each interval is a stretch of bases that contain zero incompatible
+            sites by the 4 gamete criterion
+            on each side,  the left and right,  the next polymorphic site that
+            is not in the interval must be incompatible with one of the
+            polymorphic sites that is in the interval.
+
+    input filetypes:   FASTA, SITE, VCF
+    kinds of intervals:   HandK85  CONTIG4GPASS (default)
+    kinds of returns:
+            all intervals
+            single interval
+    single interval return options:
+            random interval - uniform over list of intervals
+            randombase interval - prob sampling proportional to interval length
+            leftmost
+            rightmost
+            interval with at least n informative sites:
+                random_n
+                randombase_n
+                leftmost_n
+                rightmost_n
+    Maximum number of informative SNPs is 1000
+        set in buildlistsofsites()
+
+"""
+import sys
+import argparse
+import logging
+from logging_module import initLogger
+import random
+from gene_region import Region, RegionList
+import pysam
+from vcf_reader_func import getRecordList, vcfRegionName, getRecordsInRegion, VcfReader
+
+
+class BaseData():
+
+    def __init__(self, args, format, filename, region=None):
+        self.poslist = []
+        self.seqs = []
+        self.format = format
+        self.header = None
+        self.records = []
+        if format == 'VCF':
+            self.onlysnps = True
+            vcff = VcfReader(filename,index=args.tabix_index)
+            self.allrecords = vcff.getRecordList(region=region, chrom=args.chrom)
+            self.checkVcfRegion()
+            self.header = vcff.reader.header
+            vcff.close()
+        elif format == "FASTA":
+            self.onlysnps = False
+            f = open(filename,"r")
+            seq = ""
+            while True:
+                s = f.readline().strip()
+                if len(s) == 0:
+                    if len(seq) > 0:
+                        self.seqs.append(seq)
+                    break
+                else:
+                    if s[0] == '>':
+                        if len(seq) > 0:
+                            self.seqs.append(seq)
+                        seq = ""
+                    else:
+                        s = s.strip()
+                        if len(s) > 0:
+                            seq += s
+            f.close()
+        elif format == "SIT":
+            self.onlysnps = False
+            f = open(filename,"r")
+            f.readline()
+            while True:
+                s = f.readline()
+                if s[0] != '#':
+                    break
+            numseq = int(s.split()[0])
+            seqlen = int(s.split()[1])
+            s = f.readline().split()
+
+            numnoncode = int(s[1])
+            if numnoncode != seqlen:
+                for i in range(numnoncode):
+                    f.readline()
+            s = f.readline()
+            if s.find("All") < 0:
+                numpops = int(s.strip())
+                for i in range(numpops):
+                    f.readline()
+            ## now should be at beginning of data
+            for i in range(numseq):
+                self.seqs.append(f.readline().strip()[10:])
+            f.close()
+        self.numbases = listcheckallsamelength(self.seqs)
+        logging.info('Sequence length: %d' % self.numbases)
+        self.seqcount = len(self.seqs)
+        logging.info('Sequence count: %d' % self.seqcount)
+        #self.polysites, self.informpolysites, self.cintervals, self.cmat =
+        self.buildlistofsites(args.includesnpmissingdata, args.only2basesnps)
+
+    def buildlistofsites(self, includesnpmissingdata, only2basesnps):
+        MAXINFORMSNPS = 1000
+        #seqcount,numbases,seqs,list_of_positions,ONLYSNPS = getseqlist(basedata.filename,format)
+        # Find polymorphic sites
+        self.polysites = []
+        self.informpolysites = []
+        informpolyseqs = ['']*self.seqcount ## becomes a list of strings of informative sites,  same length as seqs,  not used as of 8/14/2017
+        baseset = set(['a','c','g','t','A','C','G','T'])
+        linesets = []
+        for j in range(self.numbases):
+            ok = True
+            basecount = [0,0,0,0]
+            for k in range(self.seqcount):
+                if self.seqs[k][j].isalpha() == False:
+                    raise Exception ("non-alphabetic character %s"
+                        " at position %d in sequence %d"%(self.seqs[k][j],k,j))
+                if self.seqs[k][j] not in baseset: ## skip sites with missing data
+                    if not includesnpmissingdata:
+                        ok = False
+                if self.seqs[k][j].upper() == 'A' :
+                    basecount[0] = basecount[0] + 1
+                if self.seqs[k][j].upper() == 'C' :
+                    basecount[1] = basecount[1] + 1
+                if self.seqs[k][j].upper() == 'G' :
+                    basecount[2] = basecount[2] + 1
+                if self.seqs[k][j].upper() == 'T' :
+                    basecount[3] = basecount[3] + 1
+            if ok == False:
+                pass
+                # print ("missing data at position " + str(j))
+            else:
+                polybase = 0
+                twobase = 0
+                for bi in range(4):
+                    if basecount[bi] > 0 :
+                        polybase += 1
+                        if basecount[bi] > 1 :
+                            twobase +=  + 1
+                if  only2basesnps == True and polybase > 2 :
+                    pass
+                    # print ("more than two bases at position " + str(j))
+                elif polybase > 1:
+                    self.polysites.append(j+1) ## use regular (start at 1) count
+                    if (twobase > 1):
+                        self.informpolysites.append(j+1)
+                        lineset =[set([]),set([]),set([]),set([])]
+                        for k in range(self.seqcount):
+                            if self.seqs[k][j].upper() == 'A' :
+                                lineset[0].add(k)
+                            if self.seqs[k][j].upper() == 'C' :
+                                lineset[1].add(k)
+                            if self.seqs[k][j].upper() == 'G' :
+                                lineset[2].add(k)
+                            if self.seqs[k][j].upper() == 'T' :
+                                lineset[3].add(k)
+                        linesets.append(lineset)
+                        if len(self.informpolysites) >= MAXINFORMSNPS:
+                            logging.warning('Maximum SNP count of %d has been hit'%(MAXINFORMSNPS))
+                            break
+        logging.info('Informative site count: %d' % len(self.informpolysites))
+        #if len(self.informpolysites) > MAXINFORMSNPS:
+        #    raise Exception("the number of informative snps: %d"
+#                    " exceeds the maximum: %d"%(len(self.informpolysites),MAXINFORMSNPS))
+        c = 0
+        sl = len(linesets)
+        if sl > 1 :
+            self.cmat = []
+            self.cintervals = []
+            for i in range(sl):
+                self.cmat.append([])
+                for j in range(sl):
+                    if i < j :
+                        if self.records[i].chrom != self.records[j].chrom or compat(linesets[i],linesets[j]) == False :
+                            self.cmat[i].append(0)
+                            self.cintervals.append([self.informpolysites[i],self.informpolysites[j]])
+                        else:
+                            self.cmat[i].append(1)
+                    else:
+                        if i==j:
+                            self.cmat[i].append(1)
+                        else:
+                            self.cmat[i].append(-1)
+        #print (self.cintervals)
+
+    #def compatibleintervals(cmat,informpolysites,numbases,
+            #list_of_positions,ONLYSNPS):
+    def compatibleintervals(self):
+        """
+         return a list of intervals
+            CONTIG4GPASS, set of most inclusive intervals, such that all the sites
+                in each are fully compatible with each other:
+                each interval is a stretch of bases that contain zero incompatible
+                sites by the 4 gamete criterion
+                on each side,  the left and right,  the next polymorphic site that
+                is not in the interval must be incompatible
+                with one of the polymorphic sites that is in the interval.
+
+                intervals are based only on informative sites
+                it is possible for intervals could be extended to include flanking
+                non-informative polymorphic sites
+                probably would have little effect in most contexts
+        """
+        tempinformpolysites = list(self.informpolysites)
+        cints = []
+        nps = len(self.informpolysites)
+        cintinformlistfull = []
+        p = 0
+        while p < nps:
+            j = p
+            donej = False
+            cintinform = 1
+            while True:
+                j += 1
+                if j >= nps or self.cmat[p][j] != 1:
+                    j -= 1
+                    donej = True
+                    break
+                else:
+                    for i in range(p+1,j+1):
+                        if self.cmat[i][j] != 1:
+                            donej = True
+                            j -= 1
+                            break
+                if donej:
+                    break
+                else:
+                    cintinform += 1
+            # if j > p:
+            cints.append([tempinformpolysites[p],tempinformpolysites[j]])
+            cintinformlistfull.append(cintinform)
+            p += 1
+        ## now remove all intervals that are overlapped by a larger one
+        cintinformremovelist = []
+        c = len(cints)
+        removelist = []
+        for ai in range(c-1) :
+            for bi in range(ai+1,c) :
+                assert (cints[ai][0] <= cints[ai][1])
+                assert (cints[bi][0] <= cints[bi][1])
+                i = cints[ai][0]
+                j = cints[ai][1]
+                m = cints[bi][0]
+                n = cints[bi][1]
+                if (m <= i)  and (j <= n):
+                    if cints[bi] not in removelist:
+                        removelist.append(cints[ai])
+                        cintinformremovelist.append(ai)
+                if (i <= m)  and (n <= j):
+                    if cints[ai] not in removelist:
+                        removelist.append(cints[bi])
+                        cintinformremovelist.append(bi)
+        for interval in removelist:
+            cints.remove(interval)
+        cintinformlist = []
+        for i in range(c):
+            if i not in cintinformremovelist:
+                cintinformlist.append(cintinformlistfull[i])
+        # now pad intervals with sequences to the flanking ones
+        cintspadded = []
+        if False:
+            cints = replace_positions(cints,self.poslist)
+            return cints,cintinformlist
+        if self.onlysnps:
+        ## using a vcf file,  intervals are 1-based  but positions in list_of_positions are 0-based
+            for ci in range(len(cints)):
+                temp = [-1,-1]
+                if ci == 0:
+                    assert cints[ci][0] > 0
+                    temp[0] = self.poslist[cints[ci][0] - 1]  ## -1 because list_of_positions is 0-based
+                else:
+                    temp[0] = self.poslist[self.informpolysites[self.informpolysites.index(cints[ci][0]) -1] -1] + 1  ## position of previous informative snp + 1
+                if ci == len(cints)-1:
+                    temp[1] = self.poslist[cints[ci][1] -1]
+                else:
+                    temp[1] = self.poslist[self.informpolysites[self.informpolysites.index(cints[ci][1]) + 1] -1] -1   ## position of next informative snp -1
+                cintspadded.append(temp)
+        else:
+            for ci in range(len(cints)):
+                temp = [-1,-1]
+                if ci == 0:
+                    temp[0] = 1
+                else:
+                    temp[0] = self.informpolysites[self.informpolysites.index(cints[ci][0]) -1] + 1   ## position of previous informative snp + 1
+                if ci == len(cints)-1:
+                    temp[1] = numbases
+                else:
+                    temp[1] = self.informpolysites[self.informpolysites.index(cints[ci][1]) + 1] -1    ## position of next informative snp -1
+                cintspadded.append(temp)
+        return cintspadded,cintinformlist
+        #return cints,cintinformlist
+
+    #def hudsonkaplan85intervals(cintervals,numbases,list_of_positions) :
+    def hk85intervals(self):
+        """
+            implements the HK algorithm from appendix 2 of Hudson & Kaplan 1985
+
+            Rm is the number of recombination events that can be
+            parsimoniously inferred from a sample of sequences
+            D is a compatibility matrix d[i,j] == 1 if all 4 gametes are present,
+                 0 otherwise
+            consider for convenience only i<j i.e. above the diagonal
+
+            H&K algorithm prunes D to leave the minimum set of intervals
+            necessary to explain the data
+
+            pruning steps:
+                for two intervals (i,j) and (m,n)  if m <= i< j <= n  remove (m,n)
+
+                for remaining intervals do these steps to remove
+                non disjoint intervals
+                    identify an interval that is not disjoint from all
+                    the others (if none can be found we are done)
+                    call this (i1,j1)
+
+                    now scan all intervals (m,n) and delete
+                    all those for which i1 < m < j2
+
+                    now search other intervals to find (i2,j2) such that i2 >= j1
+                    and this interval is not disjoint from all the
+                    remaining intervals
+                    now using (i2,j2) delete all intervals
+                    whose first component is <j2 and >i2
+        """
+        c = len(self.cintervals)
+        removelist = []
+        for ai in range(c-1) :
+            for bi in range(ai+1,c) :
+                assert (self.cintervals[ai][0] < self.cintervals[ai][1])
+                assert (self.cintervals[bi][0] < self.cintervals[bi][1])
+                i = self.cintervals[ai][0]
+                j = self.cintervals[ai][1]
+                m = self.cintervals[bi][0]
+                n = self.cintervals[bi][1]
+                if (m <= i)  and (j <= n):
+                    if self.cintervals[bi] not in removelist:
+                        removelist.append(self.cintervals[bi])
+                if (i <= m)  and (n <= j):
+                    if self.cintervals[ai] not in removelist:
+                        removelist.append(self.cintervals[ai])
+        for interval in removelist:
+            self.cintervals.remove(interval)
+        c = len(self.cintervals)
+        ai = 0
+        while ai < c-1:
+            removelist = []
+            for bi in range(ai+1,c):
+                i1 = self.cintervals[ai][0]
+                j1 = self.cintervals[ai][1]
+                b = self.cintervals[bi][0]
+                e = self.cintervals[bi][1]
+                if  (i1 < b  < j1) or (b < i1 < e):
+                    removelist.append(self.cintervals[bi])
+            for interval in removelist:
+                self.cintervals.remove(interval)
+
+            i1 = self.cintervals[ai][0]
+            j1 = self.cintervals[ai][1]
+            xi = ai + 1
+            c = len(self.cintervals)
+            while xi < c:
+                if self.cintervals[xi][0] >= j1:
+                    removelist = []
+                    for bi in range(xi+1,c):
+                        i2 = self.cintervals[xi][0]
+                        j2 = self.cintervals[xi][1]
+                        b = self.cintervals[bi][0]
+                        e = self.cintervals[bi][1]
+                        if  (i2 < b  < j2):
+                            removelist.append(self.cintervals[bi])
+                for interval in removelist:
+                    self.cintervals.remove(interval)
+                    c = len(self.cintervals)
+                xi += 1
+            c = len(self.cintervals)
+            ai += 1
+        # reset snp positions if needed
+        if len(self.poslist) > 0:
+            self.cintervals = replace_positions(self.cintervals,self.poslist)
+        return self.cintervals
+
+    def checkVcfRegion(self):
+        """
+        Checks that variants in region are SNPs and creates a list with valid sites. Also sets ploidy and position list of valid sites.
+
+        """
+        valid_bases = ['A','C','G','T']
+        skiplines = []
+        self.ploidy = -1
+        self.poslist = []
+        base_list = []
+        self.records = []
+        for record in self.allrecords:
+            lineok=True
+            for allele in record.alleles:
+                if allele not in valid_bases:
+                    lineok=False
+                    break
+            if not lineok:
+                continue
+            somedata = False
+            #data_present = 0
+            var_list = []
+            #for indiv in record.samples:
+            for i in range(len(record.samples)):
+                indiv = record.samples[i]
+                if self.ploidy == -1:
+                    self.ploidy = len(indiv.alleles)
+                if not somedata and indiv.alleles.count(None) == 0:
+                    somedata = True
+                for samp_allele in indiv.alleles:
+                    if samp_allele is None:
+                        var_list.append('.')
+                    else:
+                        var_list.append(samp_allele)
+            if somedata:
+                base_list.append(var_list)
+                self.poslist.append(record.pos)
+                self.records.append(record)
+        self.seqs = [[base_list[row][col] for row in range(0, len(base_list))] for col in range(0, len(base_list[0]))]
+
+
+def createParser():
+    parser = argparse.ArgumentParser(description=("Given a file of aligned"
+            "sequences or variable sites, will return intervals based"
+            " on 4 gamete tests"))
+    #parser.add_argument("filename", help="Input filename")
+    filetype_group = parser.add_mutually_exclusive_group(required=True)
+    filetype_group.add_argument("--fasta", dest = "fasta",
+            nargs="+", help = "fasta file(s) of"
+            " multiple aligned sequences, all the same length")
+    filetype_group.add_argument("--sit", dest = "sit",
+            nargs="+", help = " SITES format file(s)")
+    filetype_group.add_argument("--vcfreg", dest = "vcfreg", nargs=2,
+            help = "VCF file and region list, in that order")
+    filetype_group.add_argument("--vcfname", dest="vcfname", nargs="+")
+    output_group = parser.add_mutually_exclusive_group()
+    output_group.add_argument("--out", dest="out")
+    output_group.add_argument("--out-prefix", dest="out_prefix")
+    intervaltype_group = parser.add_mutually_exclusive_group(required=
+            "--reti" in sys.argv)
+    intervaltype_group.add_argument("--hk", dest="intervaltype",
+            action='store_const', const="HandK85",help = "get a set of "
+            "parsimonious intervals each containing at least 1 recombination"
+            " event.  Follows appendix 2 of Hudson and Kaplan 1985")
+    intervaltype_group.add_argument("--4gcompat", dest="intervaltype",
+            action='store_const', const="CONTIG4GPASS",help = "get a set of "
+            " intervals such that for each interval all the included bases are"
+            " compatible with each other based on the 4 gamete test")
+    returntype_group = parser.add_mutually_exclusive_group(required=True)
+    returntype_group.add_argument("--reti", dest="returntype",
+            action='store_const', const="returninterval",help = " return single "
+            " interval")
+    returntype_group.add_argument("--retl", dest="returntype",
+            action='store_const', const="returnlist", help = " return a list of"
+            " all intervals")
+    returntype_group = parser.add_mutually_exclusive_group(required=False)
+    returntype_group.add_argument("--rani", dest="intervalpicktype",
+            action='store_const', const="randominterval",default =
+            "leftinterval", help = "return a randomly selected interval"
+            " from the list of all intervals")
+    returntype_group.add_argument("--ranb", dest="intervalpicktype",
+            action='store_const', const="randombase", default="leftinterval",
+            help = "return a single randomly selected interval with probabiliy "
+            "of selection proportional to interval length")
+    returntype_group.add_argument("--left", dest="intervalpicktype",
+            action='store_const', const="leftinterval", default="leftinterval",
+            help = "return the leftmost interval")
+    returntype_group.add_argument("--right", dest="intervalpicktype",
+            action='store_const', const="rightinterval",
+            default="leftinterval",help = "return the rightmost interval")
+    parser.add_argument("--numinf",dest= "numinformativesites",
+            action="store",nargs = 1,type = int, default = None,help =
+            "limit the picking of an interval to only those with at least this"
+            " many informative snps")
+    parser.add_argument("--b2",dest="only2basesnps",action="store_true",
+            default=False,help = "causes snps with more than 2 bases "
+            "to be ignored, default is to include them")
+    parser.add_argument("--incN", dest="includesnpmissingdata",
+            action="store_true", default=False,help="causes snps with missing"
+            " data to be included, default is to ignore them")
+    parser.add_argument("--ranseed", dest="ranseed", type=int,
+            action="store",nargs = 1,default = None, help="positive integer to"
+            " serve as the random number seed "
+            "for use with --ranb or --rani, not required but"
+            " useful when debugging")
+    parser.add_argument("--chrom", dest="chrom", help="Select variants "
+            "from a single specified chromosome")
+    #add checks for correct input type
+    parser.add_argument('--tbi', dest="tabix_index")
+    return parser
+
+def logArgs(args):
+    logging.info('Arguments for vcf_region_write:')
+    for k in vars(args):
+        logging.info('Argument %s: %s' % (k, vars(args)[k]))
+
+def compat(s1, s2):
+    """
+    s1 and s2 are lists of sets of line numbers, each representing a particular
+    polymorphic site.
+    Each list as 4 sets,  one for each base: A,C,G & T.
+    Each set is all the line  numbers that share that particular base.
+    So then a polymorphic site represented by s1 will have
+    two or more nonempty sets in s1.
+    Same for s2
+
+    compat() counts the number of pairs of sets, betweeen s1 and s2,
+    for all possible pairs, for which the size of the intersection is nonzero
+    and less than the size of either set. If this is 4 or more then the
+    two sites are not compatible
+
+    this should work for 2,3 or 4 polymorphic bases in either site
+
+    """
+    count = 0
+    for i1 in range(len(s1)):
+        for i2 in range(len(s2)):
+            if len(s1[i1]) > 1 and len(s2[i2]) > 1:
+                intersect = s1[i1]  & s2[i2]
+                lenintersect = len(intersect)
+                if (lenintersect > 0 and lenintersect < len(s1[i1])
+                    and lenintersect < len(s2[i2])):
+                    count = count + 1
+    if count >= 4 :
+        return False
+    else :
+        return True
+
+def listcheckallsamelength(lcheck):
+    """
+        takes a list of lists or strings and checks to see that
+        they are all the same length returns the length that is shared by all
+    """
+    i = 0
+    for x in lcheck:
+        try:
+            lval = len(x)
+        except TypeError:
+            print('position %d not compatible with len()'%i)
+        if i>0:
+            assert lval == lastlval
+        lastlval = lval
+        i+=1
+    return lval
+
+
+
+
+def replace_positions(intervals,list_of_positions):
+    """
+        with vcf files the base number of snp locations is contained
+        in list_of_positions and the value in intervals[] are simply
+        the index positions of the snps in the vcf files.
+
+        This function rewrites the intervals to reflect the original
+        snp base positions
+    """
+    bintervals = []
+    for iv in intervals:
+        #logging.info('Index: %d %d' % (iv[0],iv[1]))
+        #logging.info('Length: %d' % len(list_of_positions))
+        bintervals.append([list_of_positions[iv[0]-1],list_of_positions[iv[1]-1]+1])
+    return bintervals
+
+
+def sampleinterval(picktype,numinf,intervals,infcounts):
+    """
+        picktype : randominterval, randombase,left,right
+        numinf : integer or None
+        intervals : list of intervals
+        infcounts: list of informative site counts,  or None
+    """
+    if len(intervals) == 0:
+        return None
+
+    #if picktype == "leftinterval":
+    #    return intervals[0]
+    #if picktype == "rightinterval":
+    #    return intervals[-1]
+    if numinf != None:
+        numinf = numinf[0]
+    if numinf != None and infcounts != None:
+        okints = [i for i in range(len(infcounts)) if infcounts[i] >= numinf ]
+    else:
+        okints = [i for i in range(len(intervals))]
+    if picktype == "leftinterval":
+        return intervals[okints[0]]
+    if picktype == "rightinterval":
+        return intervals[okints[-1]]
+    n = len(okints)
+    if n == 0: # no intervals with that many informative sites
+        return None
+    if picktype == "randominterval":
+        return  intervals[ okints[ random.randint(0,n-1)]]
+    if picktype == "randombase":
+        sumbase = 0
+        for i in range(n):
+            sumbase += intervals[okints[i]][1] -  intervals[okints[i]][0] + 1
+        rbn = random.randint(1,sumbase)
+        i = 0
+        while True:
+            if rbn > intervals[okints[i]][1] -  intervals[okints[i]][0] + 1:
+                rbn -= intervals[okints[i]][1] -  intervals[okints[i]][0] + 1
+                i += 1
+            else:
+                break
+        return  intervals[ okints[i]]
+    assert False  # should not get here
+    return None
+
+def getIntervalList(args, basedata):
+    """
+    Returns list of intervals from either hk85 or 4gcompat function.
+    Also controls return based on args provided (list or single interval)
+    """
+
+    if args.intervaltype == "HandK85":
+        #hk code doesn't return counts of the informatics sites in each interval
+        intervals = basedata.hk85intervals()
+        if args.numinformativesites != None:
+            raise Exception("--numinf not implemented when using --hk")
+        intervalinformativecounts = None # not implemented  for hk
+        numinf = None # set to None, hk use of # inform sites not implemented
+
+    else:
+        intervals, intervalinformativecounts = basedata.compatibleintervals()
+
+    if args.returntype == 'returnlist':
+        return intervals
+    else:
+        interval = sampleinterval(args.intervalpicktype,
+                                  args.numinformativesites,
+                                  intervals,intervalinformativecounts)
+        return interval
+
+
+def outputSubregion(args, interval, basedata, region=None, filename=None):
+    #create new region from interval
+    #find records in range of new interval
+    #open file with prefix and region name
+    #write records, close file
+    """
+    Outputs records in identified sub-interval to desired output file
+    """
+    if region is None:
+        subregion = Region(interval[0],interval[1]+1,basedata.records[0].chrom)
+    else:
+        subregion = Region(interval[0],interval[1]+1,region.chrom)
+    subrecords = getRecordsInRegion(subregion, basedata.records)
+    if filename is None:
+        subfn = vcfRegionName(args.out_prefix,subregion,"vcf.gz")
+    else:
+        subfn = filename
+    #print (interval)
+    #print (subfn)
+    subf = pysam.VariantFile(subfn, 'w', header=basedata.header)
+    for record in subrecords:
+        subf.write(record)
+    subf.close()
+
+
+
+
+
+def sample_fourgametetest_intervals(sys_args):
+    """Returns interval(s) from aligned sequences or snps.
+
+    Given a set of aligned sequences or a vcf file, this function
+    returns intervals that have been identified based on four-gamete
+    incombpatibility.   Intervals can either flank a recombination event or
+    be sure not to contain an incombatibility.
+
+
+    Parameters
+    ----------
+
+    Input (one required):
+    --fasta : str
+        Filename(s) of input FASTA file for testing
+    --vcfreg : str, str
+        Filename of input VCF file and BED file with list of
+        regions to run tests on
+    --vcfname : str
+        Filename(s) of VCF files, each corresponding to a region, that will be subsampled for compatible intervals
+    --sit : str
+        Filename(s) of input SITES format files for testing
+
+    Output types:
+    --out : str
+        Name of single output file.
+    --out-prefix : str
+        Prefix for name of output files
+    --out-reg : str
+        Name of file that stores passing interval per interval in input BED file (currently not working)
+
+    Tests (one required)
+    --hk : bool
+        Will find intervals that contain at least one recombination event
+    --4gcompat : bool
+        Will find intervals with zero recombination events in them
+
+    Return options:
+    --reti : bool
+        Output will be of a single region, indicated by rani/ranb/left/right
+    --retl : bool
+        Output will be of all passing regions
+
+    For single region:
+    --left : bool (default)
+        Single output region will be left-most region with enough informative sites
+    --right : bool
+        Output region will be right-most region
+    --rani : bool
+        Region will be randomly selected, with each region having equal chance of selection
+    --ranb : bool
+        Region will be randomly selected, with each region having selection odds proportional to size
+
+    General options:
+    --numinf : int
+        Number of sites in an identified interval required for reporting (default is 1)
+    --incN : bool
+        Include SNPs with missing data (default is to ignore them)
+    --ranseed : int
+        Seed for picking random interval from list
+    --tbi : str
+        Path to tabix index for input VCF file if it's name doesn't match
+
+    """
+    parser = createParser()
+    # parser.print_help()
+    if len(sys_args) == 0:
+        parser.print_help()
+        sys.exit(1)
+    args = parser.parse_args(sys_args)
+    logArgs(args)
+    #argdic = vars(args)
+    interval_list = []
+    multiple_regions = False
+    region_output = False
+    if args.ranseed != None:
+        random.seed(int(args.ranseed[0]))
+    filetype = ''
+    out_list = (args.out is None and args.out_prefix is None)
+
+    if args.vcfreg is not None:
+        filetype = "VCF"
+        vcfname = args.vcfreg[0]
+        regname = args.vcfreg[1]
+        if regname != '-':
+            region_list = RegionList(filename=regname)
+            #region_output = True
+            multiple_regions = (len(region_list.regions) > 1)
+            if multiple_regions and not out_list and args.out is not None:
+                raise Exception(("VCF with multiple regions cannot have "
+                                "single output file (--out)"))
+            for region in region_list.regions:
+                basedata = BaseData(args, "VCF",vcfname, region)
+                intervals = getIntervalList(args, basedata)
+                interval_list.append(intervals)
+                if args.out_prefix is not None:
+                    if args.returntype == 'returnlist':
+                        for iv in intervals:
+                            outputSubregion(args, iv, basedata)
+                    else:
+                        outputSubregion(args, intervals, basedata)
+                elif args.out is not None:
+                    if args.returntype == 'returnlist':
+                        raise Exception('Multiple outputs directed to single file')
+                    outputSubregion(args, intervals, basedata, region=region, filename=args.out)
+
+        else:
+            basedata = BaseData(args, "VCF",vcfname)
+            intervals = getIntervalList(args,basedata)
+            interval_list.append(intervals)
+            if args.out_prefix is not None:
+                if args.returntype == 'returnlist':
+                    for iv in intervals:
+                        outputSubregion(args, iv, basedata)
+                else:
+                    outputSubregion(args, intervals, basedata)
+            elif out_list:
+                return interval_list
+            elif args.out is not None:
+                if args.returntype == 'returnlist':
+                    raise Exception(('If --retl is specified, output must not '
+                                    'be --out'))
+                outputSubregion(args, intervals, basedata, filename=args.out)
+
+    elif args.vcfname is not None:
+        filetype="VCF"
+        if len(args.vcfname) > 1 and args.out_prefix is not None:
+            raise Exception(("Multiple VCFs require --out-prefix flag"))
+        for vcfname in args.vcfname:
+            basedata = BaseData(args, "VCF",vcfname)
+            intervals = getIntervalList(args,basedata)
+            interval_list.append(intervals)
+            if args.out_prefix is not None:
+                if args.returntype == 'returnlist':
+                    for iv in intervals:
+                        outputSubregion(args, iv, basedata)
+                else:
+                    outputSubregion(args, intervals, basedata)
+            elif out_list:
+                return interval_list
+            elif args.out is not None:
+                if args.returntype == 'returnlist':
+                    raise Exception(('If --retl is specified, output must not '
+                                    'be --out'))
+                outputSubregion(args, intervals, basedata, filename=args.out)
+    elif args.fasta is not None:
+        filetype="FASTA"
+        if len(args.fasta) > 1 and args.out_prefix is not None:
+            raise Exception(("Multiple FASTAs require --out-prefix flag"))
+        for fastaname in args.fasta:
+            basedata = BaseData(args, "FASTA", fastaname)
+            intervals = getIntervalList(args, basedata)
+            interval_list.append(intervals)
+    elif args.sit is not None:
+        filetype="SIT"
+        if len(args.sit) > 1 and args.out_prefix is not None:
+            raise Exception(("Multiple SITES files require --out-prefix flag"))
+        if args.out_reg is not None:
+            raise Exception(("SITES files cannot be output to region file"))
+        for sitname in args.sit:
+            intervals = getIntervalList(args, "SIT", sitname)
+            interval_list.append(intervals)
+
+
+
+
+    return interval_list
+        #
+
+
+if __name__ == '__main__':
+    initLogger()
+
+    ### some testing command lines
+    #sample_fourgametetest_intervals([])
+    #exit()
+    if len(sys.argv) > 1:
+        print (sample_fourgametetest_intervals(sys.argv[1:]))
+        exit(0)
+    testlist = []
+    vcf_name = "example/chr11subsamples4gtest.vcf.gz"
+    fasta_name = "example/fourgfastatest.fasta"
+    testlist.append( ["test #" + str(len(testlist)),"--vcf", vcf_name,
+                      "-", "--4gcompat", "--reti", "--ranseed", "123"])
+    testlist.append( ["test #" + str(len(testlist)),"--vcf", vcf_name,
+                      "-", "--hk", "--retl"])
+    testlist.append( ["test #" + str(len(testlist)),"--vcf", vcf_name,
+                      "-", "--hk", "--reti","--left"])
+    testlist.append( ["test #" + str(len(testlist)),"--vcf", vcf_name,
+                      "-", "--4gcompat", "--retl"])
+    testlist.append( ["test #" + str(len(testlist)),"--vcf", vcf_name,
+                     "-", "--4gcompat", "--reti","--ranseed","123","--ranb"])
+    testlist.append( ["test #" + str(len(testlist)), "--fasta", fasta_name,
+                      "--4gcompat", "--retl"])
+    testlist.append( ["test #" + str(len(testlist)),"--fasta", fasta_name,
+                      "--4gcompat", "--retl","--incN","--b2"])
+    testlist.append( ["test #" + str(len(testlist)), "--fasta", fasta_name,
+                      "--hk","--retl"])
+    testlist.append( ["test #" + str(len(testlist)), "--fasta", fasta_name, "--hk",
+            "--retl","--incN","--b2"])
+    testlist.append( ["test #" + str(len(testlist)), "--fasta", fasta_name, "--hk",
+            "--reti","--incN","--left"])
+    testlist.append( ["test #" + str(len(testlist)), "--fasta", fasta_name, "--4gcompat",
+            "--reti","--incN","--numinf","2","--ranb"])
+    testlist.append( ["test #" + str(len(testlist)), "--fasta", fasta_name, "--4gcompat",
+            "--reti","--incN","--numinf","2","--rani"])
+    testlist.append( ["test #" + str(len(testlist)), "--fasta", fasta_name, "--4gcompat",
+            "--reti","--incN","--numinf","4","--ranb"])
+    testlist.append( ["test #" + str(len(testlist)), "--fasta", fasta_name, "--4gcompat",
+            "--reti","--incN","--numinf","100","--ranb"])
+    print ("%d test jobs"%(len(testlist))                   )
+    for tl in testlist:
+        sys.argv = tl
+        a = sample_fourgametetest_intervals(sys.argv[1:])
+        print(sys.argv[0],a)
+    exit()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/four_gamete_wrapper.xml	Fri Oct 12 14:14:52 2018 -0400
@@ -0,0 +1,54 @@
+<tool id="fourgamete" name="Four Gamete Test" version="0.1">
+    <requirements>
+        <requirement type="package">pysam</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" level="fatal" />
+    </stdio>
+    <description>Tests a VCF file for a subregion that passes four gamete test</description>
+    <command>
+        <![CDATA[
+            python $__tool_directory__/four_gamete_pysam.py
+            --vcfname $input
+            --out $output
+            --4gcompat
+            --reti
+            #if str($select_arg) == 'left':
+            --left
+            #elif str($select_arg) == 'right':
+            --right
+            #elif str($select_arg) == 'randomi':
+            --rani
+            #else:
+            --ranb
+            #end if
+            --numinf $infsites
+            #if  $input.is_of_type('vcf_bgzip')
+                --tbi $input.metadata.tabix_index
+            #end if
+
+        ]]>
+
+    </command>
+    <inputs>
+        <param format="vcf,vcf_bgzip,bcf" name="input" type="data" label="VCF input" />
+        <param type="integer" name="infsites" label="Number of informative sites required in subregion" value="2" min="0" />
+        <param name="select_arg" type="select" label="Subregion to Select">
+            <option value="left">Left-most</option>
+            <option value="right" selected="True">Right-most</option>
+            <option value="randomi">Random (equal probability per region)</option>
+            <option value="randomb">Random (probability equal to size)</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="vcf" name="output"/>
+    </outputs>
+    <help>
+    <![CDATA[
+
+    Input VCF will be examined for regions that pass the four-gamete test, which indicates there have been no recombination in this region for this sample. For most cases, a failure of the four-gamete test occurs when two SNPs have four gametes between them. (00,01,10,11) Using selection criteria, a region with the target number of informative SNPs that passes the four-gamete criteria will be output to a VCF file.
+
+    ]]>
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_region.py	Fri Oct 12 14:14:52 2018 -0400
@@ -0,0 +1,376 @@
+import sys
+import re
+import logging
+from functools import total_ordering
+from random import shuffle
+
+
+def getChromKey(chrom):
+    """Get key from chromosome string so that it can be naturally sorted
+    (int < string, numbers grouped together)
+    """
+    c = chrom
+    if chrom[0:3] == 'chr':
+        c = chrom[3:]
+    convert = lambda text: int(text) if text.isdigit() else text.lower()
+    k = [ convert(i) for i in re.split('([0-9]+)',c) if len(i) != 0]
+    return k
+
+def keyComp(k1,k2):
+    """Returns True if k1 < k2, False if not. int < str"""
+    for i in range(min(len(k1),len(k2))):
+        e1 = k1[i]
+        e2 = k2[i]
+        if isinstance(e1,int) != isinstance(e2,int):
+            return isinstance(e1,int)
+        if e1 != e2:
+            return e1 < e2
+    return len(k1) < len(k2)
+
+def setRegionSort(sorttype,sortlist=None):
+    valid_types = ['natural','string','list']
+    if sorttype not in valid_types:
+        raise Exception("Sorting type %s is not valid" % (sorttype))
+    Region.sort_method = sorttype
+    if sorttype == 'list' and sortlist is not None:
+        setSortOrder(sortlist)
+
+def setSortOrder(sortlist,trimchrom=False):
+    hold_list = []
+    for c in sortlist:
+        if trimchrom and c[0:3] == 'chr':
+            hold_list.append(c[3:])
+        else:
+            hold_list.append(c)
+    Region.sort_order = hold_list
+
+@total_ordering
+class Region:
+    sort_method = "natural"
+    sort_order = None
+
+    def __init__(self, start, end, chrom):
+        """Zero-based, half open coordinates and chromosome info for
+        a region in a genome. Coords will be formatted according to
+        flags passed to RegionList, then stored in a single format.
+        """
+        self.start = start
+        self.end = end
+        self.chrom = chrom
+        #if chrom[0:3] == 'chr':
+        #    self.chrom = chrom[3:]
+        #else:
+        #    self.chrom = chrom
+    def __deepcopy__(self):
+        return Region(self.start,self.end,self.chrom)
+
+    def cloneRegion(self):
+        return Region(self.start,self.end,self.chrom)
+
+
+    def getChromKey(self):
+        """Splits chromosone name for natural sort. Ints and strs are
+        grouped with adjacent elements of the same type
+
+        Example: 'front88place' returns ['front',88,'place']
+        """
+        return getChromKey(self.chrom)
+
+
+    def __eq__(self, other):
+        return ((self.start == other.start) and (self.end == other.end)
+                and (self.chrom == other.chrom))
+
+    def __lt__(self, other):
+        """Sort key order: chrom-key, start position, end position
+        """
+        if Region.sort_method == 'natural':
+            k1 = self.getChromKey()
+            k2 = other.getChromKey()
+            if k1 != k2:
+                return keyComp(k1,k2)
+        elif Region.sort_method == "string":
+            #k1 = self.chrom
+            #k2 = other.chrom
+            k1 = (self.chrom if self.chrom[:3] != 'chr' else self.chrom[3:])
+            k2 = (other.chrom if other.chrom[:3] != 'chr' else other.chrom[3:])
+            if k1 != k2:
+                return k1 < k2
+        else:
+            if Region.sort_order is None:
+                raise Exception("Sorting by list requires calling setSortOrder() and providing a list for the order")
+            k1 = self.chrom
+            k2 = other.chrom
+            if k1 not in Region.sort_order:
+                raise Exception("Key %s is not in sort order list" % (k1))
+            if k2 not in Region.sort_order:
+                raise Exception("Key %s is not in sort order list" % (k2))
+            if k1 != k2:
+                return Region.sort_order.index(k1) < Region.sort_order.index(k2)
+        if self.start != other.start:
+            return self.start < other.start
+        return self.end < other.end
+
+    def __len__(self):
+        return self.end-self.start
+
+    def containsRecord(self, rec):
+        k1 = self.getChromKey()
+        kr = getChromKey(rec.chrom)
+        if k1 != kr:
+            if keyComp(k1,kr):
+                return 'before'
+            return 'after'
+        comp_pos = rec.pos-1
+        if comp_pos < self.start:
+            return 'before'
+        if comp_pos >= self.end:
+            return 'after'
+        return 'in'
+
+    def toStr(self, zeroho=False, zeroclosed=False, sep=':'):
+        start = self.start
+        end = self.end
+        if not zeroho:
+            start += 1
+        elif zeroclosed:
+            end -= 1
+        return str(self.chrom)+sep+str(start)+sep+str(end)
+        #return str(start)+sep+str(end)+sep+str(self.chrom)
+
+    def getReference(self, refseq):
+        return refseq.fetch(self.chrom,self.start,self.end)
+
+
+
+
+class RegionList:
+
+    def __init__(self, filename=None, genestr=None, reglist=None,
+                 zeroclosed=False, zeroho=False,
+                 colstr=None, sortlist=True, checkoverlap=None,
+                 sortmethod=None, sortorder=None, chromfilter=None,
+                 list_template=None, randomize=False):
+        """Class for storing gene region information
+
+        Will either read a file or take a single gene region in a string
+        as input, then create an object with some metadata and a list of
+        gene regions.
+
+        Parameters
+        ----------
+        filename : str (None)
+            Name of file with gene coordinate data
+        genestr : str (None)
+            Semicolon-separated list of region data, either in "start:end"
+            or "start:end:chrom" format.
+        oneidx : bool (False)
+            If true, region will be read in as a one-index based string.
+            This results in one position being subtracted from the start
+            and end coordinates
+        colstr : str (None)
+            If reading in from a file with many columns, will indicate
+            what columns hold start/end data and chrom data if three
+            items are given.
+        defaultchrom : str (None)
+            If no chromosome data is provided in the region file or string,
+            will set the chromosome to this value
+        halfopen : bool (True)
+            If true, the region range will be [start,end), so the start
+            coordinate will be included in the region but the end
+            will not
+
+        Exceptions
+        ----------
+        Both filename and genestr are None
+        colstr has less than 2 or more than 3 values
+
+        """
+        self.collist = None
+        if list_template is not None:
+            zeroho = list_template.zeroho
+            zeroclosed = list_template.zeroclosed
+            sortlist = list_template.sortlist
+            checkoverlap = list_template.checkoverlap
+            sortorder = list_template.sortorder
+
+        if self.collist is None:
+            if colstr is None:
+                self.collist = [1, 2, 0]
+            else:
+                self.parseCols(colstr)
+        if filename is None and genestr is None and reglist is None:
+            raise Exception(("A filename, region string, or list of "
+                             "region strings must be provided for " "creating a gene region list"))
+        if [filename,genestr,reglist].count(None) < 2:
+            raise Exception(("Only one of filename, region string, and "
+                            "region list can be specified for init"))
+        if zeroho and zeroclosed:
+            raise Exception(("Zero-halfopen and zero-closed options "
+                             "cannot be invoked at the same time"))
+
+        if sortlist and randomize:
+            raise Exception("Sorting and randomizing are not compatible options for RegionList")
+        if checkoverlap not in [None,'fix','error']:
+            raise Exception("Invalid checkoverlap value: %s" % (checkoverlap))
+        self.regions = []
+        self.zeroho = zeroho
+        self.zeroclosed = zeroclosed
+        self.chromfilter = chromfilter
+        if sortmethod is not None:
+            setRegionSort(sortmethod,sortlist=sortorder)
+        if filename is not None:
+            self.initFile(filename)
+        elif genestr is not None:
+            self.initStr(genestr)
+        else:
+            self.initList(reglist)
+
+        if sortlist:
+            self.regions.sort()
+        if checkoverlap is not None:
+            if self.hasOverlap():
+                if checkoverlap == "fix":
+                    self.fixOverlap()
+                elif checkoverlap == "error":
+                    raise Exception("Region overlap detected")
+
+        if randomize:
+            shuffle(self.regions)
+
+    def initFile(self,filename):
+        """Initialize RegionList with a region file
+        """
+        with open(filename, 'r') as regionfile:
+            for line in regionfile:
+                if line[0] == '#':
+                    continue
+                la = line.strip().split()
+                self.initRegion(la)
+
+
+    def initStr(self,genestr):
+        la = genestr.split(':')
+        self.initRegion(la)
+
+
+    def initList(self,reglist):
+        for reg in reglist:
+            self.initRegion(reg)
+
+
+
+    def initRegion(self,la):
+        start = int(la[self.collist[0]])
+        end = int(la[self.collist[1]])
+        chrom = la[self.collist[2]]
+        if self.chromfilter is not None and chrom != self.chromfilter:
+            return
+        if not self.zeroho:
+            start -= 1
+        elif self.zeroclosed:
+            end += 1
+        self.regions.append(Region(start,end,chrom))
+
+
+    def parseCols(self,cols):
+        col_list = [int(i) for i in cols.split(',')]
+        if len(col_list) != 3:
+            raise Exception(("Column string %s must have three "
+                             "values" % (cols)))
+        self.collist = col_list
+
+    def toStr(self, idx, sep=':'):
+        return self.regions[idx].toStr(zeroho=self.zeroho,
+                                       zeroclosed=self.zeroclosed,
+                                       sep=sep)
+
+    def hasOverlap(self):
+        region_hold = sorted(self.regions)
+        for i in range(len(region_hold)-1):
+            if region_hold[i].chrom == region_hold[i+1].chrom:
+                if (region_hold[i].start <= region_hold[i+1].start
+                    < region_hold[i].end):
+                    logging.warning("Regions %s and %s overlap" %
+                                    (self.toStr(i),self.toStr(i+1)))
+                    return True
+        return False
+
+    def fixOverlap(self):
+        region_hold = []
+        self.regions.sort()
+        i = 0
+        temp_reg = None
+        for i in range(len(self.regions)):
+            if temp_reg is None:
+                temp_reg = self.regions[i].cloneRegion()
+                continue
+            cur_reg = self.regions[i].cloneRegion()
+            if cur_reg.start < temp_reg.end and cur_reg.chrom == temp_reg.chrom:
+                temp_reg.end = max(temp_reg.end,cur_reg.end)
+            else:
+                region_hold.append(temp_reg)
+                temp_reg = cur_reg
+        region_hold.append(temp_reg)
+        self.regions = region_hold
+
+    def printList(self, file_handle=None, file_name=None,
+                  return_str=False, delim="\t", add_chr=False):
+        if file_handle is None and file_name is None:
+            file_handle = sys.stdout
+            #raise Exception("Either file_handle or file_name must be set")
+        if file_name is not None:
+            file_handle = open(file_name, 'w')
+        if return_str:
+            out_str = ''
+        for region in self.regions:
+            start = region.start
+            end = region.end
+            if not self.zeroho:
+                start += 1
+            elif self.zeroclosed:
+                end -= 1
+            chrom = region.chrom
+            if add_chr:
+                chrom = "chr"+region.chrom
+            reg_str = chrom+delim+str(start)+delim+str(end)+'\n'
+            if return_str:
+                out_str += reg_str
+            else:
+                file_handle.write(reg_str)
+        if return_str:
+            return out_str
+
+    def filterByChrom(self,chrom_list,include=False):
+        if include:
+            self.regions = [r for r in self.regions if r.chrom in chrom_list]
+        else:
+            self.regions = [r for r in self.regions if r.chrom not in chrom_list]
+
+    def filterOutXY(self):
+        self.filterByChrom(['X','Y','chrX','chrY'])
+
+    #TO do:
+    #Add sort method for strictly text based sorting
+
+def getIntervalsBetween(region_list, padding=0, firstline=True):
+    region_hold = []
+    if len(region_list.regions) < 2:
+        raise Exception("Region list for complement requires at least two regions")
+    if firstline:
+        r1 = region_list.regions[0]
+        if r1.start - padding > 0:
+            region_hold.append([str(r1.chrom),0,r1.start-padding])
+    for i in range(len(region_list.regions)-1):
+        r1 = region_list.regions[i]
+        rn = region_list.regions[i+1]
+        if r1.chrom != rn.chrom:
+            continue
+        new_start = r1.end+padding
+        new_end = rn.start-padding
+        if new_start < new_end:
+            region_hold.append([str(r1.chrom),new_start,new_end])
+    out_list = RegionList(reglist=region_hold, zeroho=True)
+    out_list.zeroho = region_list.zeroho
+    out_list.zeroclosed = region_list.zeroclosed
+    return out_list
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/logging_module.py	Fri Oct 12 14:14:52 2018 -0400
@@ -0,0 +1,97 @@
+import sys
+import logging
+
+
+
+def initLogger(filename='pipeline.log', filelevel='INFO',
+               streamlevel='WARNING', resetlog=True):
+    """General logger initialization for PPP functions.
+
+    Messages from WARNING level and higher will be logged to file and to
+    stderr. By default, INFO level will also be written to logfile. Both
+    levels are flexible.
+
+    Level groupings:
+        ERROR: Error messages should be generated by an exception call
+        WARNING: Non-terminal behavior that may be unusual (i.e. lists
+            with no values, blank strings)
+        INFO: Every major (user-facing) function should have the following:
+            -Message for function start
+            -List of input arguments and options
+            -Basic sanity checks (dimensions of input data)
+            -Statements before or after major function calls
+            -Message for function end
+        DEBUG: Mainly for developer use/debugging. Generate logs for
+            sub-functions that match INFO level for major functions.
+            Possible care should be used if there are lots of loops called.
+
+    Use: Call with either the individual function (in __name__=="__main__"
+    statement) or in pipeline file.
+
+    Parameters
+    ----------
+    filename : str ("pipeline.log")
+        Name of file that log will be written to
+
+    filelevel : {'INFO','DEBUG','WARNING','ERROR'}
+        Set minimum level of log messages that are written to log file.
+        Note that this acts as a de facto minumum for 'streamlevel' as well.
+
+    streamlevel : {'WARNING','DEBUG','INFO','ERROR'}
+        Set minimum level of log messages that are output to stream.
+
+    resetlog : bool (True)
+        If true, will overwrite logfile when opening. Set to false if log is
+        being initialized multiple times
+
+    Returns
+    -------
+    None
+
+    Exceptions
+    ----------
+    If filelevel or streamlevel are not a valid logging level
+
+    """
+    log_levels = ['DEBUG','INFO','WARNING','ERROR']
+    if filelevel is not None and filelevel.upper() not in log_levels:
+        raise Exception('filelevel value %s is not a valid level' %
+                         filelevel)
+    if streamlevel is not None and streamlevel.upper() not in log_levels:
+        raise Exception('streamlevel value %s is not a valid level' %
+                         streamlevel)
+    fmt_def = "%(asctime)s - %(funcName)s - %(levelname)s: %(message)s"
+    fmt_notime = "%(funcName)s - %(levelname)s: %(message)s"
+    fmtr = logging.Formatter(fmt=fmt_def)
+    fmtr_notime = logging.Formatter(fmt=fmt_notime)
+    filelogger = logging.getLogger()
+    filelogger.setLevel('INFO')
+    if streamlevel is not None:
+        s_handler = logging.StreamHandler()
+        s_handler.setFormatter(fmtr_notime)
+        s_handler.setLevel(streamlevel)
+        filelogger.addHandler(s_handler)
+    logmode = 'a'
+    if resetlog:
+        logmode = 'w'
+    if filelevel is not None:
+        f_handler = logging.FileHandler(filename,mode=logmode)
+        f_handler.setFormatter(fmtr)
+        f_handler.setLevel(filelevel)
+        #filelogger.setLevel(filelevel)
+        filelogger.addHandler(f_handler)
+    #Formats exception messages to be sent to appropriate loggers
+    def exp_handler(etype,val,tb):
+        logging.error("%s" % (val), exc_info=(etype,val,tb))
+
+    sys.excepthook = exp_handler
+
+
+def logArgs(args, func_name=None, print_nones=False):
+    header = "Arguments"
+    if func_name is not None:
+        header+=" for"+func_name
+    for arg in vars(args):
+        val = vars(args)[arg]
+        if val is not None or print_nones:
+            logging.info('Argument %s: %s' % (arg,val))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_reader_func.py	Fri Oct 12 14:14:52 2018 -0400
@@ -0,0 +1,474 @@
+import sys
+import pysam
+import logging
+import struct
+from random import sample
+from collections import defaultdict
+import os
+import gzip
+
+def checkIfGzip(filename):
+    try:
+        gf = gzip.open(filename)
+        gl = gf.readline()
+        gf.close()
+        vcf_check = b'##fileformat=VCF'
+        if gl[0:3] == b'BCF':
+            return 'bcf'
+        elif gl[:len(vcf_check)] == vcf_check:
+            return checkHeader(filename)
+        else:
+            return 'other'
+    except:
+        return 'nozip'
+
+def checkHeader(filename):
+    f = open(filename,'rb')
+    l = f.readline()
+    f.close()
+    BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00'
+    #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff'
+    GZF_HEADER=b'\x1f\x8b'
+    if l[:len(BGZF_HEADER)] == BGZF_HEADER:
+        return 'bgzip'
+    if l[:len(GZF_HEADER)] == GZF_HEADER:
+        return 'gzip'
+    return 'nozip'
+
+def checkFormat(vcfname):
+    """Checks header of given file for compression type
+
+
+    Given a filename, opens file and reads first line to check if
+    file has BGZF or GZIP header. May be extended to check for BCF format
+
+    Parameters
+    ----------
+    filename : str
+        Name of file to be checked
+
+    Returns
+    -------
+    extension : str {'bgzip','gzip','vcf','other'}
+        File extension as indicated by header
+
+    """
+    typ = checkIfGzip(vcfname)
+    if typ != 'nozip':
+        return typ
+    f = open(vcfname)
+    l = f.readline()
+    f.close()
+    VCF_TAG='##fileformat=VCF'
+    if l[:len(VCF_TAG)] == VCF_TAG:
+        return 'vcf'
+    return 'other'
+
+def checkIfCpG(record,fasta_ref,offset=0,add_chr=False):
+    dr = None
+    pos = record.pos
+    c = record.chrom
+    if record.alts is None:
+        return False
+    if add_chr:
+        c = 'chr'+record.chrom
+    if record.ref == 'C' and 'T' in record.alts:
+        seq = fasta_ref.fetch(c,pos-1,pos+1)
+        if seq[0].upper() != 'C':
+            logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0]))
+            #raise Exception("checkIfCpG function not lining up properly")
+        if seq[1].upper() == 'G':
+            return True
+        return False
+    elif record.ref == 'G' and 'A' in record.alts:
+        seq = fasta_ref.fetch(c,pos-2,pos)
+        if seq[1].upper() != 'G':
+            logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1]))
+            #raise Exception("checkIfCpg function not lining up on negative strand")
+        if seq[0].upper() == 'C':
+            return True
+        return False
+    return False
+
+def checkForDuplicates(rec_list,pass_list):
+    for i in range(len(rec_list)-1):
+        if rec_list[i].pos == rec_list[i+1].pos:
+            pass_list[i] = False
+            pass_list[i+1] = False
+
+def checkForMultiallele(rec_list,pass_list):
+    for i in range(len(rec_list)):
+        if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos:
+            pass_list[i] = False
+            pass_list[i+1] = False
+        if len(rec_list[i].alleles) > 2:
+            pass_list[i] = False
+
+def flipChrom(chrom):
+    if chrom[0:3] == 'chr':
+        return chrom[0:3]
+    return 'chr'+chrom
+
+def getAlleleCountDict(rec):
+    alleles = defaultdict(int)
+    total_sites = 0
+    missing_inds = 0
+    for j in range(len(rec.samples)):
+        samp = rec.samples[j]
+        if None in samp.alleles:
+            missing_inds += 1
+        for k in range(len(samp.alleles)):
+            b = samp.alleles[k]
+            if b is not None:
+                alleles[b] += 1
+            total_sites+=1
+    return alleles, total_sites, missing_inds
+
+def isInformative(rec, mincount=2, alleles=None):
+    count = 0
+    if alleles is None:
+        alleles, total_sites, missing_inds = getAlleleCountDict(rec)
+    if len(alleles) != 2:
+        return False
+    i1,i2 = alleles.keys()
+    return (alleles[i1] >= mincount and alleles[i2] >= mincount)
+
+def getPassSites(record_list, remove_cpg=False, remove_indels=True,
+                 remove_multiallele=True, remove_missing=0,
+                 inform_level=2, fasta_ref=None):
+    pass_list = [True for r in record_list]
+    if remove_cpg == True and fasta_ref is None:
+        raise Exception("CpG removal requires a reference")
+    if inform_level > 2 or inform_level < 0:
+        raise Exception("Inform level %d must be between 0 and 2" % inform_level)
+    if remove_multiallele:
+        checkForMultiallele(record_list,pass_list)
+    for i in range(len(record_list)):
+        rec = record_list[i]
+        logging.info(rec.pos)
+        if remove_indels and not checkRecordIsSnp(rec):
+            pass_list[i] = False
+        if remove_cpg and checkIfCpG(rec,fasta_ref):
+            pass_list[i] = False
+        alleles,total_sites,missing_inds = getAlleleCountDict(rec)
+        if remove_missing != -1 and missing_inds > remove_missing:
+            pass_list[i] = False
+        if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles):
+            pass_list[i] = False
+    #pp = zip([r.pos for r in record_list],pass_list)
+    #for ppp in pp:
+    #    logging.info(ppp)
+    return pass_list
+
+
+def filterSites(record_list, remove_cpg=False, remove_indels=True,
+                remove_multiallele=True, remove_missing=0, inform_level=2,
+                fasta_ref=None):
+    pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref)
+    out_list = []
+    for i in range(len(pass_list)):
+        if pass_list[i]:
+            out_list.append(record_list[i])
+    return out_list
+
+
+
+class VcfReader():
+    def __init__(self, vcfname, compress_flag=False, subsamp_num=None,
+                 subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False):
+
+        ext = checkFormat(vcfname)
+        if ext in ['gzip','other'] :
+            raise Exception(('Input file %s is gzip-formatted, must be either '
+                             'uncompressed or zipped with bgzip' % vcfname))
+        self.file_uncompressed = (ext == 'vcf')
+        self.reader_uncompressed = (self.file_uncompressed and not compress_flag)
+        self.popmodel = None
+        self.popkeys = None
+        if popmodel is not None and use_allpop:
+            raise Exception("Popmodel and allpop cannot both be specified")
+        if compress_flag and file_uncompressed:
+            vcfname = compressVcf(vcfname)
+        if subsamp_num is not None:
+            if subsamp_list is not None:
+                raise Exception('Multiple subsampling options called in getVcfReader')
+            subsamp_list = getSubsampleList(vcfname, subsamp_num)
+        elif subsamp_fn is not None:
+            if subsamp_list is not None:
+                raise Exception('Multiple subsampling options called in getVcfReader')
+            subsamp_file = open(subsamp_fn,'r')
+            subsamp_list = [l.strip() for l in subsamp_file.readlines()]
+            subsamp_file.close()
+
+        if index is None:
+            self.reader = pysam.VariantFile(vcfname)
+        else:
+            self.reader = pysam.VariantFile(vcfname, index_filename=index)
+        if popmodel is not None:
+            self.popmodel = popmodel
+            popsamp_list = popmodel.inds
+            self.reader.subset_samples(popsamp_list)
+            self.setPopIdx()
+        if use_allpop:
+            self.setAllPop()
+        if subsamp_list is not None:
+            logging.debug('Subsampling %d individuals from VCF file' %
+            (len(subsamp_list)))
+            self.reader.subset_samples(subsamp_list)
+        self.prev_last_rec = next(self.reader)
+        self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr')
+
+    def fetch(self, chrom=None, start=None, end=None):
+        return self.reader.fetch(chrom, start, end)
+
+    def getRecordList(self, region=None, chrom=None, start=None,
+                      end=None):
+        if self.reader_uncompressed:
+            ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom)
+            return ret
+        else:
+            return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom)
+
+    def setPopIdx(self):
+        self.popkeys = {}
+        sample_names = [l for l in self.reader.header.samples]
+        for p in self.popmodel.pop_list:
+            self.popkeys[p] = []
+            for ind in self.popmodel.ind_dict[p]:
+                self.popkeys[p].append(sample_names.index(ind))
+
+    def close(self):
+        self.reader.close()
+
+    def setAllPop(self):
+        self.popkeys = {'ALL':[]}
+        for i in range(len(self.reader.header.samples)):
+            self.popkeys['ALL'].append(i)
+
+def modChrom(c,vcf_chr):
+    if c is None:
+        return None
+    if vcf_chr and c[:3] != 'chr':
+        return 'chr'+c
+    if not vcf_chr and c[:3] == 'chr':
+        return c[3:]
+    return c
+
+def getRecordList(vcf_reader, region=None, chrom=None, start=None,
+                  end=None, add_chr=False):
+    """Returns list for use in subsampling from input file"""
+    if region is not None:
+        c = modChrom(region.chrom,add_chr)
+        var_sites = vcf_reader.fetch(c, region.start, region.end)
+    else:
+        c = modChrom(chrom,add_chr)
+        var_sites = vcf_reader.fetch(c, start, end)
+    lst = []
+    for rec in var_sites:
+        lst.append(rec)
+    return lst
+
+
+def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None,
+                          start=None, end=None, add_chr=False):
+    """Method for getting record list from unzipped VCF file.
+
+    This method will sequentially look through a VCF file until it finds
+    the given `start` position on `chrom`, then add all records to a list
+    until the `end` position has been reached. Note that `prev_last_rec`
+    must be kept track of externally to ensure that if consecutive regions
+    are called, the record of the first variant outside the first region
+    is not lost.
+
+    Parameters
+    ----------
+    vcf_reader : pysam VariantFile object
+        VCF reader initialized from other function
+    region : region object
+        Region object with start and end coordinates of region of interest.
+    prev_last_rec : VariantRecord object
+        Variable with last record read from VcfReader. Stored here so that
+        if two adjacent regions are called, the overflow record from the
+        first region is still included in the next region
+
+    Returns
+    -------
+    lst : list
+        List of records in given gene region
+    prev_last_rec : VariantRecord object
+        First record after target region, for use in next call
+
+    """
+    lst = []
+    if region is None:
+        lst.append(prev_last_rec)
+        for rec in vcf_reader:
+            lst.append(rec)
+        return lst, lst[-1]
+
+    if (prev_last_rec is not None and
+        region.containsRecord(prev_last_rec) == 'in'):
+        lst.append(prev_last_rec)
+    elif (prev_last_rec is not None and
+         region.containsRecord(prev_last_rec) == 'after'):
+        return []
+    rec = next(vcf_reader,None)
+    if rec is None:
+        return lst,None
+    place = region.containsRecord(rec)
+    while rec is not None and place != 'after':
+        if place == 'in':
+            lst.append(rec)
+        rec = next(vcf_reader,None)
+        if rec is None:
+            break
+        place = region.containsRecord(rec)
+    prev_last_rec = rec
+    return lst, prev_last_rec
+
+
+def checkRecordIsSnp(rec):
+    """Checks if this record is a single nucleotide variant, returns bool."""
+    if len(rec.ref) != 1:
+        return False
+    if rec.alts is None:
+        return False
+    for allele in rec.alts:
+        if len(allele) != 1:
+            return False
+    return True
+
+
+def getSubsampleList(vcfname, ss_count):
+    """Returns a list of the first `ss_count` individuals in `vcfname`
+
+    """
+
+    vcf_o = pysam.VariantFile(vcfname)
+    rec = next(vcf_o)
+    vcf_o.close()
+    lst = []
+    for samp in rec.samples:
+        lst.append(samp)
+    return lst[:int(ss_count)]
+
+
+def compressVcf(vcfname,forceflag=False,remove=False):
+    """Runs bgzip and tabix on input VCF file.
+
+    Using the pysam library, this function runs the bgzip and tabix utilities
+    on the given input file. By default, this will not overwrite an existing
+    zipped file, but will overwrite an existing index. `remove` can be set to
+    delete the unzipped file.
+
+    Parameters
+    ----------
+    vcfname : str
+        Name of uncompressed VCF file
+    forceflag : bool (False)
+        If true, will overwrite (vcfname).gz if it exists
+    remove : bool (False)
+        If true, will delete uncompressed source file
+
+    Returns
+    -------
+    cvcfname : str
+        Filepath to compressed VCF file
+    """
+    cvcfname = vcfname+".gz"
+    pysam.tabix_compress(vcfname,cvcfname,force=forceflag)
+    pysam.tabix_index(cvcfname,preset="vcf",force=True)
+    if remove:
+        os.remove(vcfname)
+    return cvcfname
+
+def vcfRegionName(prefix, region, ext, oneidx=False,
+                  halfopen=True, sep='-'):
+    chrom = region.toStr(halfopen, oneidx, sep)
+    return prefix+'_'+chrom+'.'+ext
+
+def getRecordsInRegion(region, record_list):
+    sub_list = []
+    for i in range(len(record_list)):
+        loc = region.containsRecord(record_list[i])
+        if loc == "in":
+            sub_list.append(record_list[i])
+        elif loc == "after":
+            break
+    return sub_list
+
+
+
+
+
+#def getVcfReader(args):
+def getVcfReader(vcfname, compress_flag=False, subsamp_num=None,
+                 subsamp_fn=None, subsamp_list=None, index=None):
+    """Returns a reader for a given input VCF file.
+
+    Given a filename, filetype, compression option, and optional Subsampling
+    options, will return a pysam.VariantFile object for iteration and
+    a flag as to whether this file is compressed or uncompressed.
+
+    Parameters
+    ----------
+    vcfname : str
+        Filename for VCF file. The extension of this file will be used to
+        determine whether it is compressed or not unless `var_ext` is set.
+    var_ext : str (None)
+        Extension for VCF file if it is not included in the filename.
+    compress_flag : bool (False)
+        If filetype is uncompressed and this is set to true, will run
+        compressVcf function.
+    subsamp_num : int (None)
+        If set, will randomly select `subsamp_num` individuals (not
+        genotypes) from the input VCF file and return a reader with
+        only those data.
+    subsamp_fn : str (None)
+        If set, will return a reader with only data from the samples listed
+        in the file provided. Cannot be used with other subsampling options.
+    subsamp_list : list (None)
+        If set, will return reader with records containing only
+        individuals named in the list. Cannot be used with other subsampling
+        options.
+
+    Returns
+    -------
+    vcf_reader : pysam.VariantFile
+        A reader that can be iterated through for variant records. If
+        compressed, it will be able to use the pysam fetch method, otherwise
+        it must be read through sequentially
+    reader_uncompressed : bool
+        If True, VCF reader is uncompressed. This means the fetch method
+        cannot be used and region access must be done using the
+        "getRecordListUnzipped" method.
+
+    """
+    ext = checkFormat(vcfname)
+    if ext in ['gzip','other'] :
+        raise Exception(('Input file %s is gzip-formatted, must be either '
+                         'uncompressed or zipped with bgzip' % vcfname))
+    file_uncompressed = (ext == 'vcf')
+    reader_uncompressed = (file_uncompressed and not compress_flag)
+    if compress_flag and file_uncompressed:
+        vcfname = compressVcf(vcfname)
+    #subsamp_list = None
+    if subsamp_num is not None:
+        if subsamp_list is not None:
+            raise Exception('Multiple subsampling options called in getVcfReader')
+        subsamp_list = getSubsampleList(vcfname, subsamp_num)
+    elif subsamp_fn is not None:
+        if subsamp_list is not None:
+            raise Exception('Multiple subsampling options called in getVcfReader')
+        subsamp_file = open(subsamp_fn,'r')
+        subsamp_list = [l.strip() for l in subsamp_file.readlines()]
+        subsamp_file.close()
+    if index is None:
+        vcf_reader = pysam.VariantFile(vcfname)
+    else:
+        vcf_reader = pysam.VariantFile(vcfname, index_filename=index)
+    if subsamp_list is not None:
+        logging.debug('Subsampling %d individuals from VCF file' %
+        (len(subsamp_list)))
+        vcf_reader.subset_samples(subsamp_list)
+    return vcf_reader, reader_uncompressed