annotate vcf_reader_func.py @ 5:86a9d8d5b291 draft default tip

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:34:34 -0400
parents 3830d29fca6a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
1 import sys
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
2 import pysam
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
3 import logging
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
4 import struct
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
5 from random import sample
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
6 from collections import defaultdict
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
7 import os
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
8 import gzip
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
9
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
10 def checkIfGzip(filename):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
11 try:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
12 gf = gzip.open(filename)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
13 gl = gf.readline()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
14 gf.close()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
15 vcf_check = b'##fileformat=VCF'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
16 if gl[0:3] == b'BCF':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
17 return 'bcf'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
18 elif gl[:len(vcf_check)] == vcf_check:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
19 return checkHeader(filename)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
20 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
21 return 'other'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
22 except:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
23 return 'nozip'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
24
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
25 def checkHeader(filename):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
26 f = open(filename,'rb')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
27 l = f.readline()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
28 f.close()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
29 BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
30 #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
31 GZF_HEADER=b'\x1f\x8b'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
32 if l[:len(BGZF_HEADER)] == BGZF_HEADER:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
33 return 'bgzip'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
34 if l[:len(GZF_HEADER)] == GZF_HEADER:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
35 return 'gzip'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
36 return 'nozip'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
37
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
38 def checkFormat(vcfname):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
39 """Checks header of given file for compression type
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
40
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
41
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
42 Given a filename, opens file and reads first line to check if
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
43 file has BGZF or GZIP header. May be extended to check for BCF format
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
44
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
45 Parameters
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
46 ----------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
47 filename : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
48 Name of file to be checked
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
49
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
50 Returns
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
51 -------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
52 extension : str {'bgzip','gzip','vcf','other'}
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
53 File extension as indicated by header
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
54
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
55 """
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
56 typ = checkIfGzip(vcfname)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
57 if typ != 'nozip':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
58 return typ
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
59 f = open(vcfname)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
60 l = f.readline()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
61 f.close()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
62 VCF_TAG='##fileformat=VCF'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
63 if l[:len(VCF_TAG)] == VCF_TAG:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
64 return 'vcf'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
65 return 'other'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
66
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
67 def checkIfCpG(record,fasta_ref,offset=0,add_chr=False):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
68 dr = None
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
69 pos = record.pos
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
70 c = record.chrom
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
71 if record.alts is None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
72 return False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
73 if add_chr:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
74 c = 'chr'+record.chrom
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
75 if record.ref == 'C' and 'T' in record.alts:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
76 seq = fasta_ref.fetch(c,pos-1,pos+1)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
77 if seq[0].upper() != 'C':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
78 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0]))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
79 #raise Exception("checkIfCpG function not lining up properly")
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
80 if seq[1].upper() == 'G':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
81 return True
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
82 return False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
83 elif record.ref == 'G' and 'A' in record.alts:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
84 seq = fasta_ref.fetch(c,pos-2,pos)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
85 if seq[1].upper() != 'G':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
86 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1]))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
87 #raise Exception("checkIfCpg function not lining up on negative strand")
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
88 if seq[0].upper() == 'C':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
89 return True
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
90 return False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
91 return False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
92
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
93 def checkForDuplicates(rec_list,pass_list):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
94 for i in range(len(rec_list)-1):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
95 if rec_list[i].pos == rec_list[i+1].pos:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
96 pass_list[i] = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
97 pass_list[i+1] = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
98
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
99 def checkForMultiallele(rec_list,pass_list):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
100 for i in range(len(rec_list)):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
101 if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
102 pass_list[i] = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
103 pass_list[i+1] = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
104 if len(rec_list[i].alleles) > 2:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
105 pass_list[i] = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
106
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
107 def flipChrom(chrom):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
108 if chrom[0:3] == 'chr':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
109 return chrom[0:3]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
110 return 'chr'+chrom
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
111
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
112 def getAlleleCountDict(rec):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
113 alleles = defaultdict(int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
114 total_sites = 0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
115 missing_inds = 0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
116 for j in range(len(rec.samples)):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
117 samp = rec.samples[j]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
118 if None in samp.alleles:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
119 missing_inds += 1
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
120 for k in range(len(samp.alleles)):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
121 b = samp.alleles[k]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
122 if b is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
123 alleles[b] += 1
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
124 total_sites+=1
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
125 return alleles, total_sites, missing_inds
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
126
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
127 def isInformative(rec, mincount=2, alleles=None):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
128 count = 0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
129 if alleles is None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
130 alleles, total_sites, missing_inds = getAlleleCountDict(rec)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
131 if len(alleles) != 2:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
132 return False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
133 i1,i2 = alleles.keys()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
134 return (alleles[i1] >= mincount and alleles[i2] >= mincount)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
135
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
136 def getPassSites(record_list, remove_cpg=False, remove_indels=True,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
137 remove_multiallele=True, remove_missing=0,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
138 inform_level=2, fasta_ref=None):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
139 pass_list = [True for r in record_list]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
140 if remove_cpg == True and fasta_ref is None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
141 raise Exception("CpG removal requires a reference")
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
142 if inform_level > 2 or inform_level < 0:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
143 raise Exception("Inform level %d must be between 0 and 2" % inform_level)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
144 if remove_multiallele:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
145 checkForMultiallele(record_list,pass_list)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
146 for i in range(len(record_list)):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
147 rec = record_list[i]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
148 logging.info(rec.pos)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
149 if remove_indels and not checkRecordIsSnp(rec):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
150 pass_list[i] = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
151 if remove_cpg and checkIfCpG(rec,fasta_ref):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
152 pass_list[i] = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
153 alleles,total_sites,missing_inds = getAlleleCountDict(rec)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
154 if remove_missing != -1 and missing_inds > remove_missing:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
155 pass_list[i] = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
156 if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
157 pass_list[i] = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
158 #pp = zip([r.pos for r in record_list],pass_list)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
159 #for ppp in pp:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
160 # logging.info(ppp)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
161 return pass_list
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
162
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
163
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
164 def filterSites(record_list, remove_cpg=False, remove_indels=True,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
165 remove_multiallele=True, remove_missing=0, inform_level=2,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
166 fasta_ref=None):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
167 pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
168 out_list = []
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
169 for i in range(len(pass_list)):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
170 if pass_list[i]:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
171 out_list.append(record_list[i])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
172 return out_list
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
173
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
174
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
175
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
176 class VcfReader():
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
177 def __init__(self, vcfname, compress_flag=False, subsamp_num=None,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
178 subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
179
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
180 ext = checkFormat(vcfname)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
181 if ext in ['gzip','other'] :
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
182 raise Exception(('Input file %s is gzip-formatted, must be either '
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
183 'uncompressed or zipped with bgzip' % vcfname))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
184 self.file_uncompressed = (ext == 'vcf')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
185 self.reader_uncompressed = (self.file_uncompressed and not compress_flag)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
186 self.popmodel = None
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
187 self.popkeys = None
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
188 if popmodel is not None and use_allpop:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
189 raise Exception("Popmodel and allpop cannot both be specified")
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
190 if compress_flag and file_uncompressed:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
191 vcfname = compressVcf(vcfname)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
192 if subsamp_num is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
193 if subsamp_list is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
194 raise Exception('Multiple subsampling options called in getVcfReader')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
195 subsamp_list = getSubsampleList(vcfname, subsamp_num)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
196 elif subsamp_fn is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
197 if subsamp_list is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
198 raise Exception('Multiple subsampling options called in getVcfReader')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
199 subsamp_file = open(subsamp_fn,'r')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
200 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
201 subsamp_file.close()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
202
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
203 if index is None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
204 self.reader = pysam.VariantFile(vcfname)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
205 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
206 self.reader = pysam.VariantFile(vcfname, index_filename=index)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
207 if popmodel is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
208 self.popmodel = popmodel
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
209 popsamp_list = popmodel.inds
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
210 self.reader.subset_samples(popsamp_list)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
211 self.setPopIdx()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
212 if use_allpop:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
213 self.setAllPop()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
214 if subsamp_list is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
215 logging.debug('Subsampling %d individuals from VCF file' %
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
216 (len(subsamp_list)))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
217 self.reader.subset_samples(subsamp_list)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
218 self.prev_last_rec = next(self.reader)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
219 self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
220
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
221 def fetch(self, chrom=None, start=None, end=None):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
222 return self.reader.fetch(chrom, start, end)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
223
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
224 def getRecordList(self, region=None, chrom=None, start=None,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
225 end=None):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
226 if self.reader_uncompressed:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
227 ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
228 return ret
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
229 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
230 return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
231
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
232 def setPopIdx(self):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
233 self.popkeys = {}
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
234 sample_names = [l for l in self.reader.header.samples]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
235 for p in self.popmodel.pop_list:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
236 self.popkeys[p] = []
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
237 for ind in self.popmodel.ind_dict[p]:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
238 self.popkeys[p].append(sample_names.index(ind))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
239
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
240 def close(self):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
241 self.reader.close()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
242
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
243 def setAllPop(self):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
244 self.popkeys = {'ALL':[]}
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
245 for i in range(len(self.reader.header.samples)):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
246 self.popkeys['ALL'].append(i)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
247
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
248 def modChrom(c,vcf_chr):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
249 if c is None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
250 return None
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
251 if vcf_chr and c[:3] != 'chr':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
252 return 'chr'+c
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
253 if not vcf_chr and c[:3] == 'chr':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
254 return c[3:]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
255 return c
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
256
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
257 def getRecordList(vcf_reader, region=None, chrom=None, start=None,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
258 end=None, add_chr=False):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
259 """Returns list for use in subsampling from input file"""
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
260 if region is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
261 c = modChrom(region.chrom,add_chr)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
262 var_sites = vcf_reader.fetch(c, region.start, region.end)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
263 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
264 c = modChrom(chrom,add_chr)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
265 var_sites = vcf_reader.fetch(c, start, end)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
266 lst = []
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
267 for rec in var_sites:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
268 lst.append(rec)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
269 return lst
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
270
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
271
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
272 def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
273 start=None, end=None, add_chr=False):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
274 """Method for getting record list from unzipped VCF file.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
275
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
276 This method will sequentially look through a VCF file until it finds
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
277 the given `start` position on `chrom`, then add all records to a list
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
278 until the `end` position has been reached. Note that `prev_last_rec`
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
279 must be kept track of externally to ensure that if consecutive regions
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
280 are called, the record of the first variant outside the first region
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
281 is not lost.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
282
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
283 Parameters
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
284 ----------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
285 vcf_reader : pysam VariantFile object
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
286 VCF reader initialized from other function
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
287 region : region object
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
288 Region object with start and end coordinates of region of interest.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
289 prev_last_rec : VariantRecord object
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
290 Variable with last record read from VcfReader. Stored here so that
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
291 if two adjacent regions are called, the overflow record from the
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
292 first region is still included in the next region
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
293
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
294 Returns
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
295 -------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
296 lst : list
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
297 List of records in given gene region
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
298 prev_last_rec : VariantRecord object
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
299 First record after target region, for use in next call
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
300
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
301 """
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
302 lst = []
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
303 if region is None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
304 lst.append(prev_last_rec)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
305 for rec in vcf_reader:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
306 lst.append(rec)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
307 return lst, lst[-1]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
308
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
309 if (prev_last_rec is not None and
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
310 region.containsRecord(prev_last_rec) == 'in'):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
311 lst.append(prev_last_rec)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
312 elif (prev_last_rec is not None and
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
313 region.containsRecord(prev_last_rec) == 'after'):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
314 return []
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
315 rec = next(vcf_reader,None)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
316 if rec is None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
317 return lst,None
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
318 place = region.containsRecord(rec)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
319 while rec is not None and place != 'after':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
320 if place == 'in':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
321 lst.append(rec)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
322 rec = next(vcf_reader,None)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
323 if rec is None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
324 break
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
325 place = region.containsRecord(rec)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
326 prev_last_rec = rec
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
327 return lst, prev_last_rec
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
328
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
329
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
330 def checkRecordIsSnp(rec):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
331 """Checks if this record is a single nucleotide variant, returns bool."""
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
332 if len(rec.ref) != 1:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
333 return False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
334 if rec.alts is None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
335 return False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
336 for allele in rec.alts:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
337 if len(allele) != 1:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
338 return False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
339 return True
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
340
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
341
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
342 def getSubsampleList(vcfname, ss_count):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
343 """Returns a list of the first `ss_count` individuals in `vcfname`
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
344
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
345 """
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
346
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
347 vcf_o = pysam.VariantFile(vcfname)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
348 rec = next(vcf_o)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
349 vcf_o.close()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
350 lst = []
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
351 for samp in rec.samples:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
352 lst.append(samp)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
353 return lst[:int(ss_count)]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
354
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
355
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
356 def compressVcf(vcfname,forceflag=False,remove=False):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
357 """Runs bgzip and tabix on input VCF file.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
358
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
359 Using the pysam library, this function runs the bgzip and tabix utilities
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
360 on the given input file. By default, this will not overwrite an existing
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
361 zipped file, but will overwrite an existing index. `remove` can be set to
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
362 delete the unzipped file.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
363
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
364 Parameters
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
365 ----------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
366 vcfname : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
367 Name of uncompressed VCF file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
368 forceflag : bool (False)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
369 If true, will overwrite (vcfname).gz if it exists
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
370 remove : bool (False)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
371 If true, will delete uncompressed source file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
372
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
373 Returns
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
374 -------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
375 cvcfname : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
376 Filepath to compressed VCF file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
377 """
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
378 cvcfname = vcfname+".gz"
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
379 pysam.tabix_compress(vcfname,cvcfname,force=forceflag)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
380 pysam.tabix_index(cvcfname,preset="vcf",force=True)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
381 if remove:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
382 os.remove(vcfname)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
383 return cvcfname
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
384
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
385 def vcfRegionName(prefix, region, ext, oneidx=False,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
386 halfopen=True, sep='-'):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
387 chrom = region.toStr(halfopen, oneidx, sep)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
388 return prefix+'_'+chrom+'.'+ext
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
389
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
390 def getRecordsInRegion(region, record_list):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
391 sub_list = []
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
392 for i in range(len(record_list)):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
393 loc = region.containsRecord(record_list[i])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
394 if loc == "in":
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
395 sub_list.append(record_list[i])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
396 elif loc == "after":
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
397 break
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
398 return sub_list
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
399
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
400
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
401
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
402
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
403
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
404 #def getVcfReader(args):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
405 def getVcfReader(vcfname, compress_flag=False, subsamp_num=None,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
406 subsamp_fn=None, subsamp_list=None, index=None):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
407 """Returns a reader for a given input VCF file.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
408
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
409 Given a filename, filetype, compression option, and optional Subsampling
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
410 options, will return a pysam.VariantFile object for iteration and
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
411 a flag as to whether this file is compressed or uncompressed.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
412
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
413 Parameters
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
414 ----------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
415 vcfname : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
416 Filename for VCF file. The extension of this file will be used to
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
417 determine whether it is compressed or not unless `var_ext` is set.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
418 var_ext : str (None)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
419 Extension for VCF file if it is not included in the filename.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
420 compress_flag : bool (False)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
421 If filetype is uncompressed and this is set to true, will run
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
422 compressVcf function.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
423 subsamp_num : int (None)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
424 If set, will randomly select `subsamp_num` individuals (not
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
425 genotypes) from the input VCF file and return a reader with
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
426 only those data.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
427 subsamp_fn : str (None)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
428 If set, will return a reader with only data from the samples listed
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
429 in the file provided. Cannot be used with other subsampling options.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
430 subsamp_list : list (None)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
431 If set, will return reader with records containing only
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
432 individuals named in the list. Cannot be used with other subsampling
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
433 options.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
434
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
435 Returns
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
436 -------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
437 vcf_reader : pysam.VariantFile
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
438 A reader that can be iterated through for variant records. If
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
439 compressed, it will be able to use the pysam fetch method, otherwise
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
440 it must be read through sequentially
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
441 reader_uncompressed : bool
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
442 If True, VCF reader is uncompressed. This means the fetch method
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
443 cannot be used and region access must be done using the
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
444 "getRecordListUnzipped" method.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
445
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
446 """
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
447 ext = checkFormat(vcfname)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
448 if ext in ['gzip','other'] :
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
449 raise Exception(('Input file %s is gzip-formatted, must be either '
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
450 'uncompressed or zipped with bgzip' % vcfname))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
451 file_uncompressed = (ext == 'vcf')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
452 reader_uncompressed = (file_uncompressed and not compress_flag)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
453 if compress_flag and file_uncompressed:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
454 vcfname = compressVcf(vcfname)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
455 #subsamp_list = None
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
456 if subsamp_num is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
457 if subsamp_list is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
458 raise Exception('Multiple subsampling options called in getVcfReader')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
459 subsamp_list = getSubsampleList(vcfname, subsamp_num)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
460 elif subsamp_fn is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
461 if subsamp_list is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
462 raise Exception('Multiple subsampling options called in getVcfReader')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
463 subsamp_file = open(subsamp_fn,'r')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
464 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
465 subsamp_file.close()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
466 if index is None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
467 vcf_reader = pysam.VariantFile(vcfname)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
468 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
469 vcf_reader = pysam.VariantFile(vcfname, index_filename=index)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
470 if subsamp_list is not None:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
471 logging.debug('Subsampling %d individuals from VCF file' %
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
472 (len(subsamp_list)))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
473 vcf_reader.subset_samples(subsamp_list)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
474 return vcf_reader, reader_uncompressed