Mercurial repository: jaredgk / temple_ppp
changeset 0:9127b49793a1 draft default tip
Uploaded
| author | jaredgk |
|---|---|
| date | Fri, 12 Oct 2018 14:14:52 -0400 |
| parents | |
| children | |
| files | four_gamete_pysam.py four_gamete_wrapper.xml gene_region.py logging_module.py vcf_reader_func.py |
| diffstat | 5 files changed, 1900 insertions(+), 0 deletions(-) |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/four_gamete_pysam.py	Fri Oct 12 14:14:52 2018 -0400
@@ -0,0 +1,899 @@
"""
Samples an interval based on four-gamete tests from aligned sequences.

Written for Python 3.6.

Two possible kinds of four-gamete-based intervals:

HandK85: minimal parsimonious set of intervals, such that each contains
    one or more recombination events:
    - each interval has endpoints that are informative sites that are
      incompatible on the basis of the four-gamete criterion
    - each is inferred to contain at least one recombination event
    - the intervals form a parsimonious non-overlapping set consistent
      with the data; regions not in those intervals show no evidence
      of recombination

CONTIG4GPASS (default): set of most inclusive intervals, such that all
    the sites in each are fully compatible with each other:
    - each interval is a stretch of bases that contains zero incompatible
      sites by the four-gamete criterion
    - on each side, left and right, the next polymorphic site that is not
      in the interval must be incompatible with one of the polymorphic
      sites that is in the interval

Input filetypes: FASTA, SITES, VCF
Kinds of intervals: HandK85, CONTIG4GPASS (default)
Kinds of returns:
    - all intervals
    - single interval, with these selection options:
        random interval - uniform over the list of intervals
        randombase interval - sampling probability proportional to interval length
        leftmost
        rightmost
        interval with at least n informative sites:
            random_n, randombase_n, leftmost_n, rightmost_n

The maximum number of informative SNPs is 1000, set in buildlistofsites().
"""
import sys
import argparse
import logging
from logging_module import initLogger
import random
from gene_region import Region, RegionList
import pysam
from vcf_reader_func import getRecordList, vcfRegionName, getRecordsInRegion, VcfReader


class BaseData():

    def __init__(self, args, format, filename, region=None):
        self.poslist = []
        self.seqs = []
        self.format = format
        self.header = None
        self.records = []
        if format == 'VCF':
            self.onlysnps = True
            vcff = VcfReader(filename, index=args.tabix_index)
            self.allrecords = vcff.getRecordList(region=region, chrom=args.chrom)
            self.checkVcfRegion()
            self.header = vcff.reader.header
            vcff.close()
        elif format == "FASTA":
            self.onlysnps = False
            f = open(filename, "r")
            seq = ""
            while True:
                s = f.readline().strip()
                if len(s) == 0:
                    if len(seq) > 0:
                        self.seqs.append(seq)
                    break
                else:
                    if s[0] == '>':
                        if len(seq) > 0:
                            self.seqs.append(seq)
                        seq = ""
                    else:
                        s = s.strip()
                        if len(s) > 0:
                            seq += s
            f.close()
        elif format == "SIT":
            self.onlysnps = False
            f = open(filename, "r")
            f.readline()
            while True:
                s = f.readline()
                if s[0] != '#':
                    break
            numseq = int(s.split()[0])
            seqlen = int(s.split()[1])
            s = f.readline().split()
            numnoncode = int(s[1])
            if numnoncode != seqlen:
                for i in range(numnoncode):
                    f.readline()
            s = f.readline()
            if s.find("All") < 0:
                numpops = int(s.strip())
                for i in range(numpops):
                    f.readline()
            # now should be at the beginning of the data
            for i in range(numseq):
                self.seqs.append(f.readline().strip()[10:])
            f.close()
        self.numbases = listcheckallsamelength(self.seqs)
        logging.info('Sequence length: %d' % self.numbases)
        self.seqcount = len(self.seqs)
        logging.info('Sequence count: %d' % self.seqcount)
        # sets self.polysites, self.informpolysites, self.cintervals, self.cmat
        self.buildlistofsites(args.includesnpmissingdata, args.only2basesnps)
    def buildlistofsites(self, includesnpmissingdata, only2basesnps):
        MAXINFORMSNPS = 1000
        # find polymorphic sites
        self.polysites = []
        self.informpolysites = []
        baseset = set(['a', 'c', 'g', 't', 'A', 'C', 'G', 'T'])
        linesets = []
        for j in range(self.numbases):
            ok = True
            basecount = [0, 0, 0, 0]
            for k in range(self.seqcount):
                if self.seqs[k][j].isalpha() == False:
                    raise Exception("non-alphabetic character %s"
                                    " at position %d in sequence %d" % (self.seqs[k][j], j, k))
                if self.seqs[k][j] not in baseset:  # skip sites with missing data
                    if not includesnpmissingdata:
                        ok = False
                if self.seqs[k][j].upper() == 'A':
                    basecount[0] += 1
                if self.seqs[k][j].upper() == 'C':
                    basecount[1] += 1
                if self.seqs[k][j].upper() == 'G':
                    basecount[2] += 1
                if self.seqs[k][j].upper() == 'T':
                    basecount[3] += 1
            if ok == False:
                pass  # missing data at position j
            else:
                polybase = 0
                twobase = 0
                for bi in range(4):
                    if basecount[bi] > 0:
                        polybase += 1
                    if basecount[bi] > 1:
                        twobase += 1
                if only2basesnps == True and polybase > 2:
                    pass  # more than two bases at position j
                elif polybase > 1:
                    self.polysites.append(j + 1)  # use regular (start at 1) count
                    if twobase > 1:
                        self.informpolysites.append(j + 1)
                        lineset = [set([]), set([]), set([]), set([])]
                        for k in range(self.seqcount):
                            if self.seqs[k][j].upper() == 'A':
                                lineset[0].add(k)
                            if self.seqs[k][j].upper() == 'C':
                                lineset[1].add(k)
                            if self.seqs[k][j].upper() == 'G':
                                lineset[2].add(k)
                            if self.seqs[k][j].upper() == 'T':
                                lineset[3].add(k)
                        linesets.append(lineset)
                        if len(self.informpolysites) >= MAXINFORMSNPS:
                            logging.warning('Maximum SNP count of %d has been hit' % (MAXINFORMSNPS))
                            break
        logging.info('Informative site count: %d' % len(self.informpolysites))
        sl = len(linesets)
        if sl > 1:
            self.cmat = []
            self.cintervals = []
            for i in range(sl):
                self.cmat.append([])
                for j in range(sl):
                    if i < j:
                        # self.records is only populated for VCF input, so the
                        # chromosome comparison is skipped for FASTA/SITES data
                        diffchrom = (len(self.records) > 0 and
                                     self.records[i].chrom != self.records[j].chrom)
                        if diffchrom or compat(linesets[i], linesets[j]) == False:
                            self.cmat[i].append(0)
                            self.cintervals.append([self.informpolysites[i],
                                                    self.informpolysites[j]])
                        else:
                            self.cmat[i].append(1)
                    else:
                        if i == j:
                            self.cmat[i].append(1)
                        else:
                            self.cmat[i].append(-1)

    def compatibleintervals(self):
        """
        Return a list of intervals.

        CONTIG4GPASS, set of most inclusive intervals, such that all the
        sites in each are fully compatible with each other:
        - each interval is a stretch of bases that contains zero
          incompatible sites by the four-gamete criterion
        - on each side, the left and right, the next polymorphic site that
          is not in the interval must be incompatible with one of the
          polymorphic sites that is in the interval.
        Intervals are based only on informative sites. It is possible that
        intervals could be extended to include flanking non-informative
        polymorphic sites; this would probably have little effect in most
        contexts.
        """
        tempinformpolysites = list(self.informpolysites)
        cints = []
        nps = len(self.informpolysites)
        cintinformlistfull = []
        p = 0
        while p < nps:
            j = p
            donej = False
            cintinform = 1
            while True:
                j += 1
                if j >= nps or self.cmat[p][j] != 1:
                    j -= 1
                    donej = True
                    break
                else:
                    for i in range(p + 1, j + 1):
                        if self.cmat[i][j] != 1:
                            donej = True
                            j -= 1
                            break
                    if donej:
                        break
                    else:
                        cintinform += 1
            cints.append([tempinformpolysites[p], tempinformpolysites[j]])
            cintinformlistfull.append(cintinform)
            p += 1
        # now remove all intervals that are overlapped by a larger one
        cintinformremovelist = []
        c = len(cints)
        removelist = []
        for ai in range(c - 1):
            for bi in range(ai + 1, c):
                assert cints[ai][0] <= cints[ai][1]
                assert cints[bi][0] <= cints[bi][1]
                i = cints[ai][0]
                j = cints[ai][1]
                m = cints[bi][0]
                n = cints[bi][1]
                if (m <= i) and (j <= n):
                    if cints[bi] not in removelist:
                        removelist.append(cints[ai])
                        cintinformremovelist.append(ai)
                if (i <= m) and (n <= j):
                    if cints[ai] not in removelist:
                        removelist.append(cints[bi])
                        cintinformremovelist.append(bi)
        for interval in removelist:
            cints.remove(interval)
        cintinformlist = []
        for i in range(c):
            if i not in cintinformremovelist:
                cintinformlist.append(cintinformlistfull[i])
        # now pad intervals out to the flanking informative sites
        cintspadded = []
        if self.onlysnps:
            # using a vcf file: intervals are 1-based but positions in
            # self.poslist are 0-based
            for ci in range(len(cints)):
                temp = [-1, -1]
                if ci == 0:
                    assert cints[ci][0] > 0
                    temp[0] = self.poslist[cints[ci][0] - 1]  # -1 because poslist is 0-based
                else:
                    # position of previous informative snp + 1
                    temp[0] = self.poslist[self.informpolysites[
                        self.informpolysites.index(cints[ci][0]) - 1] - 1] + 1
                if ci == len(cints) - 1:
                    temp[1] = self.poslist[cints[ci][1] - 1]
                else:
                    # position of next informative snp - 1
                    temp[1] = self.poslist[self.informpolysites[
                        self.informpolysites.index(cints[ci][1]) + 1] - 1] - 1
                cintspadded.append(temp)
        else:
            for ci in range(len(cints)):
                temp = [-1, -1]
                if ci == 0:
                    temp[0] = 1
                else:
                    # position of previous informative snp + 1
                    temp[0] = self.informpolysites[
                        self.informpolysites.index(cints[ci][0]) - 1] + 1
                if ci == len(cints) - 1:
                    temp[1] = self.numbases
                else:
                    # position of next informative snp - 1
                    temp[1] = self.informpolysites[
                        self.informpolysites.index(cints[ci][1]) + 1] - 1
                cintspadded.append(temp)
        return cintspadded, cintinformlist

    def hk85intervals(self):
        """
        Implements the HK algorithm from appendix 2 of Hudson & Kaplan 1985.

        Rm is the number of recombination events that can be parsimoniously
        inferred from a sample of sequences. D is a compatibility matrix:
        d[i,j] == 1 if all 4 gametes are present, 0 otherwise. Consider for
        convenience only i < j, i.e.
        above the diagonal.

        The H&K algorithm prunes D to leave the minimum set of intervals
        necessary to explain the data.

        Pruning steps:
        - for two intervals (i,j) and (m,n), if m <= i < j <= n, remove (m,n)
        - for the remaining intervals, repeat these steps to remove
          non-disjoint intervals:
          - identify an interval that is not disjoint from all the others
            (if none can be found we are done); call this (i1,j1)
          - now scan all intervals (m,n) and delete all those for which
            i1 < m < j1
          - now search the other intervals to find (i2,j2) such that
            i2 >= j1 and this interval is not disjoint from all the
            remaining intervals
          - now using (i2,j2), delete all intervals whose first component
            is > i2 and < j2
        """
        c = len(self.cintervals)
        removelist = []
        for ai in range(c - 1):
            for bi in range(ai + 1, c):
                assert self.cintervals[ai][0] < self.cintervals[ai][1]
                assert self.cintervals[bi][0] < self.cintervals[bi][1]
                i = self.cintervals[ai][0]
                j = self.cintervals[ai][1]
                m = self.cintervals[bi][0]
                n = self.cintervals[bi][1]
                if (m <= i) and (j <= n):
                    if self.cintervals[bi] not in removelist:
                        removelist.append(self.cintervals[bi])
                if (i <= m) and (n <= j):
                    if self.cintervals[ai] not in removelist:
                        removelist.append(self.cintervals[ai])
        for interval in removelist:
            self.cintervals.remove(interval)
        c = len(self.cintervals)
        ai = 0
        while ai < c - 1:
            removelist = []
            for bi in range(ai + 1, c):
                i1 = self.cintervals[ai][0]
                j1 = self.cintervals[ai][1]
                b = self.cintervals[bi][0]
                e = self.cintervals[bi][1]
                if (i1 < b < j1) or (b < i1 < e):
                    removelist.append(self.cintervals[bi])
            for interval in removelist:
                self.cintervals.remove(interval)
            i1 = self.cintervals[ai][0]
            j1 = self.cintervals[ai][1]
            xi = ai + 1
            c = len(self.cintervals)
            while xi < c:
                if self.cintervals[xi][0] >= j1:
                    removelist = []
                    for bi in range(xi + 1, c):
                        i2 = self.cintervals[xi][0]
                        j2 = self.cintervals[xi][1]
                        b = self.cintervals[bi][0]
                        e = self.cintervals[bi][1]
                        if i2 < b < j2:
                            removelist.append(self.cintervals[bi])
                    for interval in removelist:
                        self.cintervals.remove(interval)
                    c = len(self.cintervals)
                xi += 1
            c = len(self.cintervals)
            ai += 1
        # reset snp positions if needed
        if len(self.poslist) > 0:
            self.cintervals = replace_positions(self.cintervals, self.poslist)
        return self.cintervals

    def checkVcfRegion(self):
        """
        Checks that variants in the region are SNPs and creates a list of
        valid sites. Also sets the ploidy and the position list of valid sites.
+ + """ + valid_bases = ['A','C','G','T'] + skiplines = [] + self.ploidy = -1 + self.poslist = [] + base_list = [] + self.records = [] + for record in self.allrecords: + lineok=True + for allele in record.alleles: + if allele not in valid_bases: + lineok=False + break + if not lineok: + continue + somedata = False + #data_present = 0 + var_list = [] + #for indiv in record.samples: + for i in range(len(record.samples)): + indiv = record.samples[i] + if self.ploidy == -1: + self.ploidy = len(indiv.alleles) + if not somedata and indiv.alleles.count(None) == 0: + somedata = True + for samp_allele in indiv.alleles: + if samp_allele is None: + var_list.append('.') + else: + var_list.append(samp_allele) + if somedata: + base_list.append(var_list) + self.poslist.append(record.pos) + self.records.append(record) + self.seqs = [[base_list[row][col] for row in range(0, len(base_list))] for col in range(0, len(base_list[0]))] + + +def createParser(): + parser = argparse.ArgumentParser(description=("Given a file of aligned" + "sequences or variable sites, will return intervals based" + " on 4 gamete tests")) + #parser.add_argument("filename", help="Input filename") + filetype_group = parser.add_mutually_exclusive_group(required=True) + filetype_group.add_argument("--fasta", dest = "fasta", + nargs="+", help = "fasta file(s) of" + " multiple aligned sequences, all the same length") + filetype_group.add_argument("--sit", dest = "sit", + nargs="+", help = " SITES format file(s)") + filetype_group.add_argument("--vcfreg", dest = "vcfreg", nargs=2, + help = "VCF file and region list, in that order") + filetype_group.add_argument("--vcfname", dest="vcfname", nargs="+") + output_group = parser.add_mutually_exclusive_group() + output_group.add_argument("--out", dest="out") + output_group.add_argument("--out-prefix", dest="out_prefix") + intervaltype_group = parser.add_mutually_exclusive_group(required= + "--reti" in sys.argv) + intervaltype_group.add_argument("--hk", dest="intervaltype", + action='store_const', const="HandK85",help = "get a set of " + "parsimonious intervals each containing at least 1 recombination" + " event. 
        "one recombination event. Follows appendix 2 of Hudson and Kaplan 1985")
    intervaltype_group.add_argument("--4gcompat", dest="intervaltype",
        action='store_const', const="CONTIG4GPASS",
        help="get a set of intervals such that for each interval all the "
        "included bases are compatible with each other based on the 4 gamete test")
    returntype_group = parser.add_mutually_exclusive_group(required=True)
    returntype_group.add_argument("--reti", dest="returntype",
        action='store_const', const="returninterval",
        help="return a single interval")
    returntype_group.add_argument("--retl", dest="returntype",
        action='store_const', const="returnlist",
        help="return a list of all intervals")
    intervalpick_group = parser.add_mutually_exclusive_group(required=False)
    intervalpick_group.add_argument("--rani", dest="intervalpicktype",
        action='store_const', const="randominterval", default="leftinterval",
        help="return a randomly selected interval from the list of all intervals")
    intervalpick_group.add_argument("--ranb", dest="intervalpicktype",
        action='store_const', const="randombase", default="leftinterval",
        help="return a single randomly selected interval with probability "
        "of selection proportional to interval length")
    intervalpick_group.add_argument("--left", dest="intervalpicktype",
        action='store_const', const="leftinterval", default="leftinterval",
        help="return the leftmost interval")
    intervalpick_group.add_argument("--right", dest="intervalpicktype",
        action='store_const', const="rightinterval", default="leftinterval",
        help="return the rightmost interval")
    parser.add_argument("--numinf", dest="numinformativesites",
        action="store", nargs=1, type=int, default=None,
        help="limit the picking of an interval to only those with at least "
        "this many informative snps")
    parser.add_argument("--b2", dest="only2basesnps", action="store_true",
        default=False,
        help="causes snps with more than 2 bases to be ignored, "
        "default is to include them")
    parser.add_argument("--incN", dest="includesnpmissingdata",
        action="store_true", default=False,
        help="causes snps with missing data to be included, "
        "default is to ignore them")
    parser.add_argument("--ranseed", dest="ranseed", type=int,
        action="store", nargs=1, default=None,
        help="positive integer to serve as the random number seed for use "
        "with --ranb or --rani; not required, but useful when debugging")
    parser.add_argument("--chrom", dest="chrom",
        help="select variants from a single specified chromosome")
    parser.add_argument('--tbi', dest="tabix_index",
        help="path to tabix index if its name does not match the VCF")
    return parser


def logArgs(args):
    logging.info('Arguments for four_gamete_pysam:')
    for k in vars(args):
        logging.info('Argument %s: %s' % (k, vars(args)[k]))


def compat(s1, s2):
    """
    s1 and s2 are lists of sets of line numbers, each list representing a
    particular polymorphic site. Each list has 4 sets, one for each base:
    A, C, G and T. Each set holds all the line numbers that share that
    particular base. A polymorphic site represented by s1 will therefore
    have two or more nonempty sets in s1; the same holds for s2.

    compat() counts the number of pairs of sets between s1 and s2, over
    all possible pairs, for which the size of the intersection is nonzero
    and less than the size of either set. If this is 4 or more, then the
If this is 4 or more then the + two sites are not compatible + + this should work for 2,3 or 4 polymorphic bases in either site + + """ + count = 0 + for i1 in range(len(s1)): + for i2 in range(len(s2)): + if len(s1[i1]) > 1 and len(s2[i2]) > 1: + intersect = s1[i1] & s2[i2] + lenintersect = len(intersect) + if (lenintersect > 0 and lenintersect < len(s1[i1]) + and lenintersect < len(s2[i2])): + count = count + 1 + if count >= 4 : + return False + else : + return True + +def listcheckallsamelength(lcheck): + """ + takes a list of lists or strings and checks to see that + they are all the same length returns the length that is shared by all + """ + i = 0 + for x in lcheck: + try: + lval = len(x) + except TypeError: + print('position %d not compatible with len()'%i) + if i>0: + assert lval == lastlval + lastlval = lval + i+=1 + return lval + + + + +def replace_positions(intervals,list_of_positions): + """ + with vcf files the base number of snp locations is contained + in list_of_positions and the value in intervals[] are simply + the index positions of the snps in the vcf files. + + This function rewrites the intervals to reflect the original + snp base positions + """ + bintervals = [] + for iv in intervals: + #logging.info('Index: %d %d' % (iv[0],iv[1])) + #logging.info('Length: %d' % len(list_of_positions)) + bintervals.append([list_of_positions[iv[0]-1],list_of_positions[iv[1]-1]+1]) + return bintervals + + +def sampleinterval(picktype,numinf,intervals,infcounts): + """ + picktype : randominterval, randombase,left,right + numinf : integer or None + intervals : list of intervals + infcounts: list of informative site counts, or None + """ + if len(intervals) == 0: + return None + + #if picktype == "leftinterval": + # return intervals[0] + #if picktype == "rightinterval": + # return intervals[-1] + if numinf != None: + numinf = numinf[0] + if numinf != None and infcounts != None: + okints = [i for i in range(len(infcounts)) if infcounts[i] >= numinf ] + else: + okints = [i for i in range(len(intervals))] + if picktype == "leftinterval": + return intervals[okints[0]] + if picktype == "rightinterval": + return intervals[okints[-1]] + n = len(okints) + if n == 0: # no intervals with that many informative sites + return None + if picktype == "randominterval": + return intervals[ okints[ random.randint(0,n-1)]] + if picktype == "randombase": + sumbase = 0 + for i in range(n): + sumbase += intervals[okints[i]][1] - intervals[okints[i]][0] + 1 + rbn = random.randint(1,sumbase) + i = 0 + while True: + if rbn > intervals[okints[i]][1] - intervals[okints[i]][0] + 1: + rbn -= intervals[okints[i]][1] - intervals[okints[i]][0] + 1 + i += 1 + else: + break + return intervals[ okints[i]] + assert False # should not get here + return None + +def getIntervalList(args, basedata): + """ + Returns list of intervals from either hk85 or 4gcompat function. 
    Also controls the return type based on the args provided (full list or
    single interval).
    """
    if args.intervaltype == "HandK85":
        # hk code doesn't return counts of the informative sites per interval
        intervals = basedata.hk85intervals()
        if args.numinformativesites != None:
            raise Exception("--numinf not implemented when using --hk")
        intervalinformativecounts = None  # not implemented for hk
    else:
        intervals, intervalinformativecounts = basedata.compatibleintervals()
    if args.returntype == 'returnlist':
        return intervals
    else:
        return sampleinterval(args.intervalpicktype,
                              args.numinformativesites,
                              intervals, intervalinformativecounts)


def outputSubregion(args, interval, basedata, region=None, filename=None):
    """
    Outputs records in the identified sub-interval to the desired output
    file: creates a new region from the interval, finds records in range
    of the new region, opens a file named with the prefix and region name,
    then writes the records and closes the file.
    """
    if region is None:
        subregion = Region(interval[0], interval[1] + 1, basedata.records[0].chrom)
    else:
        subregion = Region(interval[0], interval[1] + 1, region.chrom)
    subrecords = getRecordsInRegion(subregion, basedata.records)
    if filename is None:
        subfn = vcfRegionName(args.out_prefix, subregion, "vcf.gz")
    else:
        subfn = filename
    subf = pysam.VariantFile(subfn, 'w', header=basedata.header)
    for record in subrecords:
        subf.write(record)
    subf.close()


def sample_fourgametetest_intervals(sys_args):
    """Returns interval(s) from aligned sequences or snps.

    Given a set of aligned sequences or a vcf file, this function returns
    intervals that have been identified based on four-gamete
    incompatibility. Intervals can either flank a recombination event or
    be guaranteed not to contain an incompatibility.

    Parameters
    ----------

    Input (one required):
    --fasta : str
        Filename(s) of input FASTA file for testing
    --vcfreg : str, str
        Filename of input VCF file and BED file with list of regions to
        run tests on
    --vcfname : str
        Filename(s) of VCF files, each corresponding to a region, that
        will be subsampled for compatible intervals
    --sit : str
        Filename(s) of input SITES format files for testing

    Output types:
    --out : str
        Name of single output file
    --out-prefix : str
        Prefix for names of output files

    Tests (one required):
    --hk : bool
        Will find intervals that contain at least one recombination event
    --4gcompat : bool
        Will find intervals with zero recombination events in them

    Return options:
    --reti : bool
        Output will be a single region, indicated by rani/ranb/left/right
    --retl : bool
        Output will be all passing regions

    For a single region:
    --left : bool (default)
        Single output region will be the left-most region with enough
        informative sites
    --right : bool
        Output region will be the right-most region
    --rani : bool
        Region will be randomly selected, with each region having an
        equal chance of selection
    --ranb : bool
        Region will be randomly selected, with each region having
        selection odds proportional to its size

    General options:
    --numinf : int
        Number of informative sites required in an identified interval
        for reporting (default is 1)
    --incN : bool
        Include SNPs with missing data (default is to ignore them)
    --ranseed : int
        Seed for picking a random interval from the list
    --tbi : str
        Path to tabix index for the input VCF file if its name doesn't match
    """
    parser = createParser()
    if len(sys_args) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(sys_args)
    logArgs(args)
    interval_list = []
    multiple_regions = False
    if args.ranseed != None:
        random.seed(int(args.ranseed[0]))
    out_list = (args.out is None and args.out_prefix is None)
    if args.vcfreg is not None:
        vcfname = args.vcfreg[0]
        regname = args.vcfreg[1]
        if regname != '-':
            region_list = RegionList(filename=regname)
            multiple_regions = (len(region_list.regions) > 1)
            if multiple_regions and not out_list and args.out is not None:
                raise Exception("VCF with multiple regions cannot have a "
                                "single output file (--out)")
            for region in region_list.regions:
                basedata = BaseData(args, "VCF", vcfname, region)
                intervals = getIntervalList(args, basedata)
                interval_list.append(intervals)
                if args.out_prefix is not None:
                    if args.returntype == 'returnlist':
                        for iv in intervals:
                            outputSubregion(args, iv, basedata)
                    else:
                        outputSubregion(args, intervals, basedata)
                elif args.out is not None:
                    if args.returntype == 'returnlist':
                        raise Exception('Multiple outputs directed to single file')
                    outputSubregion(args, intervals, basedata, region=region,
                                    filename=args.out)
        else:
            basedata = BaseData(args, "VCF", vcfname)
            intervals = getIntervalList(args, basedata)
            interval_list.append(intervals)
            if args.out_prefix is not None:
                if args.returntype == 'returnlist':
                    for iv in intervals:
                        outputSubregion(args, iv, basedata)
                else:
                    outputSubregion(args, intervals, basedata)
            elif out_list:
                return interval_list
            elif args.out is not None:
                if args.returntype == 'returnlist':
                    raise Exception('If --retl is specified, output must not be --out')
                outputSubregion(args, intervals, basedata, filename=args.out)
    elif args.vcfname is not None:
        if len(args.vcfname) > 1 and args.out_prefix is None:
            raise Exception("Multiple VCFs require the --out-prefix flag")
        for vcfname in args.vcfname:
            basedata = BaseData(args, "VCF", vcfname)
            intervals = getIntervalList(args, basedata)
            interval_list.append(intervals)
            if args.out_prefix is not None:
                if args.returntype == 'returnlist':
                    for iv in intervals:
                        outputSubregion(args, iv, basedata)
                else:
                    outputSubregion(args, intervals, basedata)
            elif out_list:
                return interval_list
            elif args.out is not None:
                if args.returntype == 'returnlist':
                    raise Exception('If --retl is specified, output must not be --out')
                outputSubregion(args, intervals, basedata, filename=args.out)
    elif args.fasta is not None:
        if len(args.fasta) > 1 and args.out_prefix is None:
            raise Exception("Multiple FASTAs require the --out-prefix flag")
        for fastaname in args.fasta:
            basedata = BaseData(args, "FASTA", fastaname)
            intervals = getIntervalList(args, basedata)
            interval_list.append(intervals)
    elif args.sit is not None:
        if len(args.sit) > 1 and args.out_prefix is None:
            raise Exception("Multiple SITES files require the --out-prefix flag")
        for sitname in args.sit:
            basedata = BaseData(args, "SIT", sitname)
            intervals = getIntervalList(args, basedata)
            interval_list.append(intervals)
    return interval_list


if __name__ == '__main__':
    initLogger()
    # some testing command lines
    if len(sys.argv) > 1:
        print(sample_fourgametetest_intervals(sys.argv[1:]))
        exit(0)
    testlist = []
    vcf_name = "example/chr11subsamples4gtest.vcf.gz"
    fasta_name = "example/fourgfastatest.fasta"
    testlist.append(["test #" + str(len(testlist)), "--vcfreg", vcf_name,
                     "-", "--4gcompat", "--reti", "--ranseed", "123"])
    testlist.append(["test #" + str(len(testlist)), "--vcfreg", vcf_name,
                     "-", "--hk", "--retl"])
    testlist.append(["test #" + str(len(testlist)), "--vcfreg", vcf_name,
                     "-", "--hk", "--reti", "--left"])
    testlist.append(["test #" + str(len(testlist)), "--vcfreg", vcf_name,
                     "-", "--4gcompat", "--retl"])
    testlist.append(["test #" + str(len(testlist)), "--vcfreg", vcf_name,
                     "-", "--4gcompat", "--reti", "--ranseed", "123", "--ranb"])
    testlist.append(["test #" + str(len(testlist)), "--fasta", fasta_name,
                     "--4gcompat", "--retl"])
    testlist.append(["test #" + str(len(testlist)), "--fasta", fasta_name,
                     "--4gcompat", "--retl", "--incN", "--b2"])
    testlist.append(["test #" + str(len(testlist)), "--fasta", fasta_name,
                     "--hk", "--retl"])
    testlist.append(["test #" + str(len(testlist)), "--fasta", fasta_name,
                     "--hk", "--retl", "--incN", "--b2"])
    testlist.append(["test #" + str(len(testlist)), "--fasta", fasta_name,
                     "--hk", "--reti", "--incN", "--left"])
    testlist.append(["test #" + str(len(testlist)), "--fasta", fasta_name,
                     "--4gcompat", "--reti", "--incN", "--numinf", "2", "--ranb"])
    testlist.append(["test #" + str(len(testlist)), "--fasta", fasta_name,
                     "--4gcompat", "--reti", "--incN", "--numinf", "2", "--rani"])
    testlist.append(["test #" + str(len(testlist)), "--fasta", fasta_name,
                     "--4gcompat", "--reti", "--incN", "--numinf", "4", "--ranb"])
    testlist.append(["test #" + str(len(testlist)), "--fasta", fasta_name,
                     "--4gcompat", "--reti", "--incN", "--numinf", "100", "--ranb"])
    print("%d test jobs" % (len(testlist)))
    for tl in testlist:
        sys.argv = tl
        a = sample_fourgametetest_intervals(sys.argv[1:])
        print(sys.argv[0], a)
    exit()
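For reference, a minimal sketch (not part of the changeset) of driving the module from Python rather than the shell; it assumes the example VCF named in the test block above exists alongside its tabix index:

    from four_gamete_pysam import sample_fourgametetest_intervals

    # find the leftmost fully four-gamete-compatible interval with at
    # least two informative SNPs, reading regions from a BED file
    intervals = sample_fourgametetest_intervals(
        ["--vcfreg", "example/chr11subsamples4gtest.vcf.gz", "regions.bed",
         "--4gcompat", "--reti", "--left", "--numinf", "2"])
    print(intervals)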
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/four_gamete_wrapper.xml	Fri Oct 12 14:14:52 2018 -0400
@@ -0,0 +1,54 @@
<tool id="fourgamete" name="Four Gamete Test" version="0.1">
    <requirements>
        <requirement type="package">pysam</requirement>
    </requirements>
    <stdio>
        <exit_code range="1:" level="fatal" />
    </stdio>
    <description>Tests a VCF file for a subregion that passes the four-gamete test</description>
    <command>
    <![CDATA[
        python $__tool_directory__/four_gamete_pysam.py
        --vcfname $input
        --out $output
        --4gcompat
        --reti
        #if str($select_arg) == 'left':
            --left
        #elif str($select_arg) == 'right':
            --right
        #elif str($select_arg) == 'randomi':
            --rani
        #else:
            --ranb
        #end if
        --numinf $infsites
        #if $input.is_of_type('vcf_bgzip')
            --tbi $input.metadata.tabix_index
        #end if
    ]]>
    </command>
    <inputs>
        <param format="vcf,vcf_bgzip,bcf" name="input" type="data" label="VCF input" />
        <param type="integer" name="infsites" label="Number of informative sites required in subregion" value="2" min="0" />
        <param name="select_arg" type="select" label="Subregion to Select">
            <option value="left">Left-most</option>
            <option value="right" selected="True">Right-most</option>
            <option value="randomi">Random (equal probability per region)</option>
            <option value="randomb">Random (probability proportional to size)</option>
        </param>
    </inputs>
    <outputs>
        <data format="vcf" name="output"/>
    </outputs>
    <help>
    <![CDATA[
    The input VCF is examined for regions that pass the four-gamete test,
    which indicates that no recombination has occurred in the region for
    this sample. In most cases, a failure of the four-gamete test occurs
    when two SNPs exhibit all four gametes between them (00, 01, 10, 11).
    Using the selection criteria, a region with the target number of
    informative SNPs that passes the four-gamete criterion will be written
    to a VCF file.
    ]]>
    </help>
</tool>
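As a rough illustration of what the wrapper assembles (a sketch, not Galaxy output), the equivalent direct call through the Python entry point would look like the following; input.vcf.gz and out.vcf are placeholder names:

    from four_gamete_pysam import sample_fourgametetest_intervals

    # mirrors the Cheetah template above: single VCF in, one subregion out,
    # right-most interval with at least two informative sites
    sample_fourgametetest_intervals(
        ["--vcfname", "input.vcf.gz", "--out", "out.vcf",
         "--4gcompat", "--reti", "--right", "--numinf", "2"])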
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_region.py	Fri Oct 12 14:14:52 2018 -0400
@@ -0,0 +1,376 @@
import sys
import re
import logging
from functools import total_ordering
from random import shuffle


def getChromKey(chrom):
    """Get key from chromosome string so that it can be naturally sorted
    (int < string, numbers grouped together)
    """
    c = chrom
    if chrom[0:3] == 'chr':
        c = chrom[3:]
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    k = [convert(i) for i in re.split('([0-9]+)', c) if len(i) != 0]
    return k


def keyComp(k1, k2):
    """Returns True if k1 < k2, False if not. int < str"""
    for i in range(min(len(k1), len(k2))):
        e1 = k1[i]
        e2 = k2[i]
        if isinstance(e1, int) != isinstance(e2, int):
            return isinstance(e1, int)
        if e1 != e2:
            return e1 < e2
    return len(k1) < len(k2)


def setRegionSort(sorttype, sortlist=None):
    valid_types = ['natural', 'string', 'list']
    if sorttype not in valid_types:
        raise Exception("Sorting type %s is not valid" % (sorttype))
    Region.sort_method = sorttype
    if sorttype == 'list' and sortlist is not None:
        setSortOrder(sortlist)


def setSortOrder(sortlist, trimchrom=False):
    hold_list = []
    for c in sortlist:
        if trimchrom and c[0:3] == 'chr':
            hold_list.append(c[3:])
        else:
            hold_list.append(c)
    Region.sort_order = hold_list


@total_ordering
class Region:
    sort_method = "natural"
    sort_order = None

    def __init__(self, start, end, chrom):
        """Zero-based, half-open coordinates and chromosome info for
        a region in a genome. Coords will be formatted according to
        flags passed to RegionList, then stored in a single format.
        """
        self.start = start
        self.end = end
        self.chrom = chrom

    def __deepcopy__(self, memo=None):
        return Region(self.start, self.end, self.chrom)

    def cloneRegion(self):
        return Region(self.start, self.end, self.chrom)

    def getChromKey(self):
        """Splits chromosome name for natural sort.
        Ints and strs are grouped with adjacent elements of the same type.

        Example: 'front88place' returns ['front', 88, 'place']
        """
        return getChromKey(self.chrom)

    def __eq__(self, other):
        return ((self.start == other.start) and (self.end == other.end)
                and (self.chrom == other.chrom))

    def __lt__(self, other):
        """Sort key order: chrom key, start position, end position"""
        if Region.sort_method == 'natural':
            k1 = self.getChromKey()
            k2 = other.getChromKey()
            if k1 != k2:
                return keyComp(k1, k2)
        elif Region.sort_method == "string":
            k1 = (self.chrom if self.chrom[:3] != 'chr' else self.chrom[3:])
            k2 = (other.chrom if other.chrom[:3] != 'chr' else other.chrom[3:])
            if k1 != k2:
                return k1 < k2
        else:
            if Region.sort_order is None:
                raise Exception("Sorting by list requires calling "
                                "setSortOrder() and providing a list for the order")
            k1 = self.chrom
            k2 = other.chrom
            if k1 not in Region.sort_order:
                raise Exception("Key %s is not in sort order list" % (k1))
            if k2 not in Region.sort_order:
                raise Exception("Key %s is not in sort order list" % (k2))
            if k1 != k2:
                return Region.sort_order.index(k1) < Region.sort_order.index(k2)
        if self.start != other.start:
            return self.start < other.start
        return self.end < other.end

    def __len__(self):
        return self.end - self.start

    def containsRecord(self, rec):
        k1 = self.getChromKey()
        kr = getChromKey(rec.chrom)
        if k1 != kr:
            if keyComp(k1, kr):
                return 'before'
            return 'after'
        comp_pos = rec.pos - 1
        if comp_pos < self.start:
            return 'before'
        if comp_pos >= self.end:
            return 'after'
        return 'in'

    def toStr(self, zeroho=False, zeroclosed=False, sep=':'):
        start = self.start
        end = self.end
        if not zeroho:
            start += 1
        elif zeroclosed:
            end -= 1
        return str(self.chrom) + sep + str(start) + sep + str(end)

    def getReference(self, refseq):
        return refseq.fetch(self.chrom, self.start, self.end)


class RegionList:

    def __init__(self, filename=None, genestr=None, reglist=None,
                 zeroclosed=False, zeroho=False,
                 colstr=None, sortlist=True, checkoverlap=None,
                 sortmethod=None, sortorder=None, chromfilter=None,
                 list_template=None, randomize=False):
        """Class for storing gene region information.

        Will either read a file or take a single gene region in a string
        as input, then create an object with some metadata and a list of
        gene regions.

        Parameters
        ----------
        filename : str (None)
            Name of file with gene coordinate data
        genestr : str (None)
            Colon-separated region string, in "start:end" or
            "start:end:chrom" format
        zeroho : bool (False)
            If true, coordinates are read as zero-based and half-open;
            otherwise they are read as one-based and closed, and one is
            subtracted from the start coordinate on input
        colstr : str (None)
            If reading from a file with many columns, indicates which
            columns hold the start, end, and chrom data
        zeroclosed : bool (False)
            If true, coordinates are read as zero-based and closed, so one
            is added to the end coordinate on input
        sortlist : bool (True)
            If true, the regions are sorted after being read

        Exceptions
        ----------
        Raised if filename, genestr, and reglist are all None; if more
        than one of them is given; or if colstr does not have exactly
        three values.
        """
        self.collist = None
        if list_template is not None:
            zeroho = list_template.zeroho
            zeroclosed = list_template.zeroclosed
            sortlist = list_template.sortlist
            checkoverlap = list_template.checkoverlap
            sortorder = list_template.sortorder
        if self.collist is None:
            if colstr is None:
                self.collist = [1, 2, 0]
            else:
                self.parseCols(colstr)
        if filename is None and genestr is None and reglist is None:
            raise Exception("A filename, region string, or list of region "
                            "strings must be provided for creating a gene region list")
        if [filename, genestr, reglist].count(None) < 2:
            raise Exception("Only one of filename, region string, and "
                            "region list can be specified for init")
        if zeroho and zeroclosed:
            raise Exception("Zero-halfopen and zero-closed options "
                            "cannot be invoked at the same time")
        if sortlist and randomize:
            raise Exception("Sorting and randomizing are not compatible "
                            "options for RegionList")
        if checkoverlap not in [None, 'fix', 'error']:
            raise Exception("Invalid checkoverlap value: %s" % (checkoverlap))
        self.regions = []
        self.zeroho = zeroho
        self.zeroclosed = zeroclosed
        self.chromfilter = chromfilter
        if sortmethod is not None:
            setRegionSort(sortmethod, sortlist=sortorder)
        if filename is not None:
            self.initFile(filename)
        elif genestr is not None:
            self.initStr(genestr)
        else:
            self.initList(reglist)
        if sortlist:
            self.regions.sort()
        if checkoverlap is not None:
            if self.hasOverlap():
                if checkoverlap == "fix":
                    self.fixOverlap()
                elif checkoverlap == "error":
                    raise Exception("Region overlap detected")
        if randomize:
            shuffle(self.regions)

    def initFile(self, filename):
        """Initialize RegionList with a region file"""
        with open(filename, 'r') as regionfile:
            for line in regionfile:
                if line[0] == '#':
                    continue
                la = line.strip().split()
                self.initRegion(la)

    def initStr(self, genestr):
        la = genestr.split(':')
        self.initRegion(la)

    def initList(self, reglist):
        for reg in reglist:
            self.initRegion(reg)

    def initRegion(self, la):
        start = int(la[self.collist[0]])
        end = int(la[self.collist[1]])
        chrom = la[self.collist[2]]
        if self.chromfilter is not None and chrom != self.chromfilter:
            return
        if not self.zeroho:
            start -= 1
        elif self.zeroclosed:
            end += 1
        self.regions.append(Region(start, end, chrom))

    def parseCols(self, cols):
        col_list = [int(i) for i in cols.split(',')]
        if len(col_list) != 3:
            raise Exception("Column string %s must have three values" % (cols))
        self.collist = col_list

    def toStr(self, idx, sep=':'):
        return self.regions[idx].toStr(zeroho=self.zeroho,
                                       zeroclosed=self.zeroclosed,
                                       sep=sep)

    def hasOverlap(self):
        region_hold = sorted(self.regions)
        for i in range(len(region_hold) - 1):
            if region_hold[i].chrom == region_hold[i + 1].chrom:
                if (region_hold[i].start <= region_hold[i + 1].start
                        < region_hold[i].end):
                    logging.warning("Regions %s and %s overlap" %
                                    (self.toStr(i), self.toStr(i + 1)))
                    return True
        return False

    def fixOverlap(self):
        region_hold = []
        self.regions.sort()
        temp_reg = None
        for i in range(len(self.regions)):
            if temp_reg is None:
                temp_reg = self.regions[i].cloneRegion()
                continue
            cur_reg = self.regions[i].cloneRegion()
            if cur_reg.start < temp_reg.end and cur_reg.chrom == temp_reg.chrom:
                temp_reg.end = max(temp_reg.end, cur_reg.end)
            else:
                region_hold.append(temp_reg)
                temp_reg = cur_reg
        region_hold.append(temp_reg)
        self.regions = region_hold

    def printList(self, file_handle=None, file_name=None,
                  return_str=False, delim="\t", add_chr=False):
        if file_handle is None and file_name is None:
            file_handle = sys.stdout
        if file_name is not None:
            file_handle = open(file_name, 'w')
        if return_str:
            out_str = ''
        for region in self.regions:
            start = region.start
            end = region.end
            if not self.zeroho:
                start += 1
            elif self.zeroclosed:
                end -= 1
            chrom = region.chrom
            if add_chr:
                chrom = "chr" + region.chrom
            reg_str = chrom + delim + str(start) + delim + str(end) + '\n'
            if return_str:
                out_str += reg_str
            else:
                file_handle.write(reg_str)
        if return_str:
            return out_str

    def filterByChrom(self, chrom_list, include=False):
        if include:
            self.regions = [r for r in self.regions if r.chrom in chrom_list]
        else:
            self.regions = [r for r in self.regions if r.chrom not in chrom_list]

    def filterOutXY(self):
        self.filterByChrom(['X', 'Y', 'chrX', 'chrY'])

    # To do: add sort method for strictly text-based sorting


def getIntervalsBetween(region_list, padding=0, firstline=True):
    region_hold = []
    if len(region_list.regions) < 2:
        raise Exception("Region list for complement requires at least two regions")
    if firstline:
        r1 = region_list.regions[0]
        if r1.start - padding > 0:
            region_hold.append([str(r1.chrom), 0, r1.start - padding])
    for i in range(len(region_list.regions) - 1):
        r1 = region_list.regions[i]
        rn = region_list.regions[i + 1]
        if r1.chrom != rn.chrom:
            continue
        new_start = r1.end + padding
        new_end = rn.start - padding
        if new_start < new_end:
            region_hold.append([str(r1.chrom), new_start, new_end])
    out_list = RegionList(reglist=region_hold, zeroho=True)
    out_list.zeroho = region_list.zeroho
    out_list.zeroclosed = region_list.zeroclosed
    return out_list
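A small usage sketch for this module (not part of the changeset), assuming a three-column, whitespace-separated region file named regions.bed with chrom/start/end columns, which matches the default collist of [1, 2, 0]:

    from gene_region import RegionList

    # read zero-based half-open coordinates, natural-sort them, and
    # merge any overlapping regions in place
    regions = RegionList(filename="regions.bed", zeroho=True,
                         sortmethod="natural", checkoverlap="fix")
    for i in range(len(regions.regions)):
        print(regions.toStr(i))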
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/logging_module.py	Fri Oct 12 14:14:52 2018 -0400
@@ -0,0 +1,97 @@
import sys
import logging


def initLogger(filename='pipeline.log', filelevel='INFO',
               streamlevel='WARNING', resetlog=True):
    """General logger initialization for PPP functions.

    Messages from WARNING level and higher will be logged to file and to
    stderr. By default, INFO level will also be written to the logfile.
    Both levels are flexible.

    Level groupings:
        ERROR: Error messages should be generated by an exception call
        WARNING: Non-terminal behavior that may be unusual (i.e. lists
            with no values, blank strings)
        INFO: Every major (user-facing) function should have the following:
            - message for function start
            - list of input arguments and options
            - basic sanity checks (dimensions of input data)
            - statements before or after major function calls
            - message for function end
        DEBUG: Mainly for developer use/debugging. Generate logs for
            sub-functions that match the INFO level for major functions.
            Care should be used if there are lots of loops called.

    Use: call from either the individual function (in an
    __name__ == "__main__" statement) or in a pipeline file.

    Parameters
    ----------
    filename : str ("pipeline.log")
        Name of file that the log will be written to
    filelevel : {'INFO','DEBUG','WARNING','ERROR'}
        Set minimum level of log messages that are written to the log file.
        Note that this acts as a de facto minimum for 'streamlevel' as well.
    streamlevel : {'WARNING','DEBUG','INFO','ERROR'}
        Set minimum level of log messages that are output to the stream.
    resetlog : bool (True)
        If true, will overwrite the logfile when opening. Set to false if
        the log is being initialized multiple times.

    Returns
    -------
    None

    Exceptions
    ----------
    Raised if filelevel or streamlevel is not a valid logging level.
    """
    log_levels = ['DEBUG', 'INFO', 'WARNING', 'ERROR']
    if filelevel is not None and filelevel.upper() not in log_levels:
        raise Exception('filelevel value %s is not a valid level' % filelevel)
    if streamlevel is not None and streamlevel.upper() not in log_levels:
        raise Exception('streamlevel value %s is not a valid level' % streamlevel)
    fmt_def = "%(asctime)s - %(funcName)s - %(levelname)s: %(message)s"
    fmt_notime = "%(funcName)s - %(levelname)s: %(message)s"
    fmtr = logging.Formatter(fmt=fmt_def)
    fmtr_notime = logging.Formatter(fmt=fmt_notime)
    filelogger = logging.getLogger()
    filelogger.setLevel('INFO')
    if streamlevel is not None:
        s_handler = logging.StreamHandler()
        s_handler.setFormatter(fmtr_notime)
        s_handler.setLevel(streamlevel)
        filelogger.addHandler(s_handler)
    logmode = 'a'
    if resetlog:
        logmode = 'w'
    if filelevel is not None:
        f_handler = logging.FileHandler(filename, mode=logmode)
        f_handler.setFormatter(fmtr)
        f_handler.setLevel(filelevel)
        filelogger.addHandler(f_handler)

    # formats exception messages to be sent to the appropriate loggers
    def exp_handler(etype, val, tb):
        logging.error("%s" % (val), exc_info=(etype, val, tb))

    sys.excepthook = exp_handler


def logArgs(args, func_name=None, print_nones=False):
    header = "Arguments"
    if func_name is not None:
        header += " for " + func_name
    for arg in vars(args):
        val = vars(args)[arg]
        if val is not None or print_nones:
            logging.info('Argument %s: %s' % (arg, val))
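A short sketch of the intended call pattern for this module; the log filename and the namespace fields are illustrative only:

    import argparse
    import logging
    from logging_module import initLogger, logArgs

    initLogger(filename="mytool.log", streamlevel="WARNING")
    args = argparse.Namespace(vcfname="input.vcf.gz", out=None)
    logArgs(args, func_name="mytool")  # None values are skipped by default
    logging.info("run complete")  # written to mytool.log at INFO level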
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_reader_func.py	Fri Oct 12 14:14:52 2018 -0400
@@ -0,0 +1,474 @@
import sys
import pysam
import logging
import struct
from random import sample
from collections import defaultdict
import os
import gzip


def checkIfGzip(filename):
    try:
        gf = gzip.open(filename)
        gl = gf.readline()
        gf.close()
        vcf_check = b'##fileformat=VCF'
        if gl[0:3] == b'BCF':
            return 'bcf'
        elif gl[:len(vcf_check)] == vcf_check:
            return checkHeader(filename)
        else:
            return 'other'
    except OSError:  # not gzip data
        return 'nozip'


def checkHeader(filename):
    f = open(filename, 'rb')
    l = f.readline()
    f.close()
    BGZF_HEADER = b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00'
    GZF_HEADER = b'\x1f\x8b'
    if l[:len(BGZF_HEADER)] == BGZF_HEADER:
        return 'bgzip'
    if l[:len(GZF_HEADER)] == GZF_HEADER:
        return 'gzip'
    return 'nozip'


def checkFormat(vcfname):
    """Checks the header of the given file for compression type.

    Given a filename, opens the file and reads the first line to check if
    the file has a BGZF or GZIP header, and checks for BCF format.

    Parameters
    ----------
    vcfname : str
        Name of file to be checked

    Returns
    -------
    extension : str {'bgzip','gzip','vcf','bcf','other'}
        File format as indicated by the header
    """
    typ = checkIfGzip(vcfname)
    if typ != 'nozip':
        return typ
    f = open(vcfname)
    l = f.readline()
    f.close()
    VCF_TAG = '##fileformat=VCF'
    if l[:len(VCF_TAG)] == VCF_TAG:
        return 'vcf'
    return 'other'


def checkIfCpG(record, fasta_ref, offset=0, add_chr=False):
    pos = record.pos
    c = record.chrom
    if record.alts is None:
        return False
    if add_chr:
        c = 'chr' + record.chrom
    if record.ref == 'C' and 'T' in record.alts:
        seq = fasta_ref.fetch(c, pos - 1, pos + 1)
        if seq[0].upper() != 'C':
            logging.warning('%s %d has bad base %s' % (record.chrom, record.pos, seq[0]))
        if seq[1].upper() == 'G':
            return True
        return False
    elif record.ref == 'G' and 'A' in record.alts:
        seq = fasta_ref.fetch(c, pos - 2, pos)
        if seq[1].upper() != 'G':
            logging.warning('%s %d has bad base %s' % (record.chrom, record.pos, seq[1]))
        if seq[0].upper() == 'C':
            return True
        return False
    return False


def checkForDuplicates(rec_list, pass_list):
    for i in range(len(rec_list) - 1):
        if rec_list[i].pos == rec_list[i + 1].pos:
            pass_list[i] = False
            pass_list[i + 1] = False


def checkForMultiallele(rec_list, pass_list):
    for i in range(len(rec_list)):
        if i != len(rec_list) - 1 and rec_list[i].pos == rec_list[i + 1].pos:
            pass_list[i] = False
            pass_list[i + 1] = False
        if len(rec_list[i].alleles) > 2:
            pass_list[i] = False


def flipChrom(chrom):
    if chrom[0:3] == 'chr':
        return chrom[3:]
    return 'chr' + chrom


def getAlleleCountDict(rec):
    alleles = defaultdict(int)
    total_sites = 0
    missing_inds = 0
    for j in range(len(rec.samples)):
        samp = rec.samples[j]
        if None in samp.alleles:
            missing_inds += 1
        for k in range(len(samp.alleles)):
            b = samp.alleles[k]
            if b is not None:
                alleles[b] += 1
                total_sites += 1
    return alleles, total_sites, missing_inds


def isInformative(rec, mincount=2, alleles=None):
    if alleles is None:
        alleles, total_sites, missing_inds = getAlleleCountDict(rec)
    if len(alleles) != 2:
        return False
    i1, i2 = alleles.keys()
    return (alleles[i1] >= mincount and alleles[i2] >= mincount)
def getPassSites(record_list, remove_cpg=False, remove_indels=True,
                 remove_multiallele=True, remove_missing=0,
                 inform_level=2, fasta_ref=None):
    pass_list = [True for r in record_list]
    if remove_cpg == True and fasta_ref is None:
        raise Exception("CpG removal requires a reference")
    if inform_level > 2 or inform_level < 0:
        raise Exception("Inform level %d must be between 0 and 2" % inform_level)
    if remove_multiallele:
        checkForMultiallele(record_list, pass_list)
    for i in range(len(record_list)):
        rec = record_list[i]
        logging.info(rec.pos)
        if remove_indels and not checkRecordIsSnp(rec):
            pass_list[i] = False
        if remove_cpg and checkIfCpG(rec, fasta_ref):
            pass_list[i] = False
        alleles, total_sites, missing_inds = getAlleleCountDict(rec)
        if remove_missing != -1 and missing_inds > remove_missing:
            pass_list[i] = False
        if inform_level != 0 and not isInformative(rec, mincount=inform_level,
                                                   alleles=alleles):
            pass_list[i] = False
    return pass_list


def filterSites(record_list, remove_cpg=False, remove_indels=True,
                remove_multiallele=True, remove_missing=0, inform_level=2,
                fasta_ref=None):
    pass_list = getPassSites(record_list, remove_cpg, remove_indels,
                             remove_multiallele, remove_missing,
                             inform_level, fasta_ref)
    out_list = []
    for i in range(len(pass_list)):
        if pass_list[i]:
            out_list.append(record_list[i])
    return out_list


class VcfReader():
    def __init__(self, vcfname, compress_flag=False, subsamp_num=None,
                 subsamp_fn=None, subsamp_list=None, index=None,
                 popmodel=None, use_allpop=False):
        ext = checkFormat(vcfname)
        if ext in ['gzip', 'other']:
            raise Exception('Input file %s is gzip-formatted, must be either '
                            'uncompressed or zipped with bgzip' % vcfname)
        self.file_uncompressed = (ext == 'vcf')
        self.reader_uncompressed = (self.file_uncompressed and not compress_flag)
        self.popmodel = None
        self.popkeys = None
        if popmodel is not None and use_allpop:
            raise Exception("Popmodel and allpop cannot both be specified")
        if compress_flag and self.file_uncompressed:
            vcfname = compressVcf(vcfname)
        if subsamp_num is not None:
            if subsamp_list is not None:
                raise Exception('Multiple subsampling options called in getVcfReader')
            subsamp_list = getSubsampleList(vcfname, subsamp_num)
        elif subsamp_fn is not None:
            if subsamp_list is not None:
                raise Exception('Multiple subsampling options called in getVcfReader')
            subsamp_file = open(subsamp_fn, 'r')
            subsamp_list = [l.strip() for l in subsamp_file.readlines()]
            subsamp_file.close()
        if index is None:
            self.reader = pysam.VariantFile(vcfname)
        else:
            self.reader = pysam.VariantFile(vcfname, index_filename=index)
        if popmodel is not None:
            self.popmodel = popmodel
            popsamp_list = popmodel.inds
            self.reader.subset_samples(popsamp_list)
            self.setPopIdx()
        if use_allpop:
            self.setAllPop()
        if subsamp_list is not None:
            logging.debug('Subsampling %d individuals from VCF file' %
                          (len(subsamp_list)))
            self.reader.subset_samples(subsamp_list)
        self.prev_last_rec = next(self.reader)
        self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr')

    def fetch(self, chrom=None, start=None, end=None):
        return self.reader.fetch(chrom, start, end)

    def getRecordList(self, region=None, chrom=None, start=None, end=None):
        if self.reader_uncompressed:
            ret, self.prev_last_rec = getRecordListUnzipped(
                self.reader, self.prev_last_rec, region,
                add_chr=self.chr_in_chrom)
            return ret
        else:
            return getRecordList(self.reader, region, chrom, start, end,
                                 self.chr_in_chrom)

    def setPopIdx(self):
        self.popkeys = {}
        sample_names = [l for l in self.reader.header.samples]
        for p in self.popmodel.pop_list:
            self.popkeys[p] = []
            for ind in self.popmodel.ind_dict[p]:
                self.popkeys[p].append(sample_names.index(ind))

    def close(self):
        self.reader.close()

    def setAllPop(self):
        self.popkeys = {'ALL': []}
        for i in range(len(self.reader.header.samples)):
            self.popkeys['ALL'].append(i)


def modChrom(c, vcf_chr):
    if c is None:
        return None
    if vcf_chr and c[:3] != 'chr':
        return 'chr' + c
    if not vcf_chr and c[:3] == 'chr':
        return c[3:]
    return c


def getRecordList(vcf_reader, region=None, chrom=None, start=None,
                  end=None, add_chr=False):
    """Returns a list of records from the input file for use in subsampling."""
    if region is not None:
        c = modChrom(region.chrom, add_chr)
        var_sites = vcf_reader.fetch(c, region.start, region.end)
    else:
        c = modChrom(chrom, add_chr)
        var_sites = vcf_reader.fetch(c, start, end)
    lst = []
    for rec in var_sites:
        lst.append(rec)
    return lst


def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None,
                          add_chr=False):
    """Method for getting a record list from an unzipped VCF file.

    This method will sequentially look through a VCF file until it finds
    the start of the given region, then add all records to a list until
    the end of the region has been reached. Note that `prev_last_rec`
    must be kept track of externally to ensure that if consecutive
    regions are called, the record of the first variant outside the first
    region is not lost.

    Parameters
    ----------
    vcf_reader : pysam VariantFile object
        VCF reader initialized from another function
    region : region object
        Region object with start and end coordinates of the region of
        interest
    prev_last_rec : VariantRecord object
        Variable with the last record read from the VcfReader. Stored here
        so that
Stored here so that + if two adjacent regions are called, the overflow record from the + first region is still included in the next region + + Returns + ------- + lst : list + List of records in given gene region + prev_last_rec : VariantRecord object + First record after target region, for use in next call + + """ + lst = [] + if region is None: + lst.append(prev_last_rec) + for rec in vcf_reader: + lst.append(rec) + return lst, lst[-1] + + if (prev_last_rec is not None and + region.containsRecord(prev_last_rec) == 'in'): + lst.append(prev_last_rec) + elif (prev_last_rec is not None and + region.containsRecord(prev_last_rec) == 'after'): + return [] + rec = next(vcf_reader,None) + if rec is None: + return lst,None + place = region.containsRecord(rec) + while rec is not None and place != 'after': + if place == 'in': + lst.append(rec) + rec = next(vcf_reader,None) + if rec is None: + break + place = region.containsRecord(rec) + prev_last_rec = rec + return lst, prev_last_rec + + +def checkRecordIsSnp(rec): + """Checks if this record is a single nucleotide variant, returns bool.""" + if len(rec.ref) != 1: + return False + if rec.alts is None: + return False + for allele in rec.alts: + if len(allele) != 1: + return False + return True + + +def getSubsampleList(vcfname, ss_count): + """Returns a list of the first `ss_count` individuals in `vcfname` + + """ + + vcf_o = pysam.VariantFile(vcfname) + rec = next(vcf_o) + vcf_o.close() + lst = [] + for samp in rec.samples: + lst.append(samp) + return lst[:int(ss_count)] + + +def compressVcf(vcfname,forceflag=False,remove=False): + """Runs bgzip and tabix on input VCF file. + + Using the pysam library, this function runs the bgzip and tabix utilities + on the given input file. By default, this will not overwrite an existing + zipped file, but will overwrite an existing index. `remove` can be set to + delete the unzipped file. + + Parameters + ---------- + vcfname : str + Name of uncompressed VCF file + forceflag : bool (False) + If true, will overwrite (vcfname).gz if it exists + remove : bool (False) + If true, will delete uncompressed source file + + Returns + ------- + cvcfname : str + Filepath to compressed VCF file + """ + cvcfname = vcfname+".gz" + pysam.tabix_compress(vcfname,cvcfname,force=forceflag) + pysam.tabix_index(cvcfname,preset="vcf",force=True) + if remove: + os.remove(vcfname) + return cvcfname + +def vcfRegionName(prefix, region, ext, oneidx=False, + halfopen=True, sep='-'): + chrom = region.toStr(halfopen, oneidx, sep) + return prefix+'_'+chrom+'.'+ext + +def getRecordsInRegion(region, record_list): + sub_list = [] + for i in range(len(record_list)): + loc = region.containsRecord(record_list[i]) + if loc == "in": + sub_list.append(record_list[i]) + elif loc == "after": + break + return sub_list + + + + + +#def getVcfReader(args): +def getVcfReader(vcfname, compress_flag=False, subsamp_num=None, + subsamp_fn=None, subsamp_list=None, index=None): + """Returns a reader for a given input VCF file. + + Given a filename, filetype, compression option, and optional Subsampling + options, will return a pysam.VariantFile object for iteration and + a flag as to whether this file is compressed or uncompressed. + + Parameters + ---------- + vcfname : str + Filename for VCF file. The extension of this file will be used to + determine whether it is compressed or not unless `var_ext` is set. + var_ext : str (None) + Extension for VCF file if it is not included in the filename. 
    compress_flag : bool (False)
        If the file is uncompressed and this is set to true, will run the
        compressVcf function.
    subsamp_num : int (None)
        If set, will select `subsamp_num` individuals (not genotypes)
        from the input VCF file and return a reader with only those data.
    subsamp_fn : str (None)
        If set, will return a reader with only data from the samples
        listed in the file provided. Cannot be used with other
        subsampling options.
    subsamp_list : list (None)
        If set, will return a reader with records containing only the
        individuals named in the list. Cannot be used with other
        subsampling options.

    Returns
    -------
    vcf_reader : pysam.VariantFile
        A reader that can be iterated through for variant records. If
        compressed, it will be able to use the pysam fetch method;
        otherwise it must be read through sequentially.
    reader_uncompressed : bool
        If True, the VCF reader is uncompressed. This means the fetch
        method cannot be used, and region access must be done using the
        getRecordListUnzipped method.
    """
    ext = checkFormat(vcfname)
    if ext in ['gzip', 'other']:
        raise Exception('Input file %s is gzip-formatted, must be either '
                        'uncompressed or zipped with bgzip' % vcfname)
    file_uncompressed = (ext == 'vcf')
    reader_uncompressed = (file_uncompressed and not compress_flag)
    if compress_flag and file_uncompressed:
        vcfname = compressVcf(vcfname)
    if subsamp_num is not None:
        if subsamp_list is not None:
            raise Exception('Multiple subsampling options called in getVcfReader')
        subsamp_list = getSubsampleList(vcfname, subsamp_num)
    elif subsamp_fn is not None:
        if subsamp_list is not None:
            raise Exception('Multiple subsampling options called in getVcfReader')
        subsamp_file = open(subsamp_fn, 'r')
        subsamp_list = [l.strip() for l in subsamp_file.readlines()]
        subsamp_file.close()
    if index is None:
        vcf_reader = pysam.VariantFile(vcfname)
    else:
        vcf_reader = pysam.VariantFile(vcfname, index_filename=index)
    if subsamp_list is not None:
        logging.debug('Subsampling %d individuals from VCF file' %
                      (len(subsamp_list)))
        vcf_reader.subset_samples(subsamp_list)
    return vcf_reader, reader_uncompressed
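Finally, a minimal sketch (not part of the changeset) combining the reader helpers with the Region class from gene_region.py; the coordinates and filename are placeholders, and a bgzipped, tabix-indexed VCF is assumed:

    from gene_region import Region
    from vcf_reader_func import VcfReader, filterSites

    reader = VcfReader("input.vcf.gz")
    # pull records in a zero-based half-open window, then keep only
    # biallelic SNPs with at least two copies of each allele
    recs = reader.getRecordList(region=Region(100000, 200000, "chr11"))
    snps = filterSites(recs, remove_indels=True, inform_level=2)
    print("%d informative biallelic SNPs in region" % len(snps))
    reader.close()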
