annotate vcf_reader_func.py @ 0:9127b49793a1 draft default tip

Uploaded
author jaredgk
date Fri, 12 Oct 2018 14:14:52 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
1 import sys
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
2 import pysam
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
3 import logging
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
4 import struct
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
5 from random import sample
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
6 from collections import defaultdict
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
7 import os
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
8 import gzip
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
9
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
10 def checkIfGzip(filename):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
11 try:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
12 gf = gzip.open(filename)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
13 gl = gf.readline()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
14 gf.close()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
15 vcf_check = b'##fileformat=VCF'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
16 if gl[0:3] == b'BCF':
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
17 return 'bcf'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
18 elif gl[:len(vcf_check)] == vcf_check:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
19 return checkHeader(filename)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
20 else:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
21 return 'other'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
22 except:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
23 return 'nozip'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
24
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
25 def checkHeader(filename):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
26 f = open(filename,'rb')
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
27 l = f.readline()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
28 f.close()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
29 BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
30 #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
31 GZF_HEADER=b'\x1f\x8b'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
32 if l[:len(BGZF_HEADER)] == BGZF_HEADER:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
33 return 'bgzip'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
34 if l[:len(GZF_HEADER)] == GZF_HEADER:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
35 return 'gzip'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
36 return 'nozip'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
37
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
38 def checkFormat(vcfname):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
39 """Checks header of given file for compression type
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
40
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
41
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
42 Given a filename, opens file and reads first line to check if
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
43 file has BGZF or GZIP header. May be extended to check for BCF format
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
44
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
45 Parameters
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
46 ----------
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
47 filename : str
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
48 Name of file to be checked
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
49
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
50 Returns
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
51 -------
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
52 extension : str {'bgzip','gzip','vcf','other'}
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
53 File extension as indicated by header
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
54
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
55 """
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
56 typ = checkIfGzip(vcfname)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
57 if typ != 'nozip':
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
58 return typ
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
59 f = open(vcfname)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
60 l = f.readline()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
61 f.close()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
62 VCF_TAG='##fileformat=VCF'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
63 if l[:len(VCF_TAG)] == VCF_TAG:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
64 return 'vcf'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
65 return 'other'
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
66
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
67 def checkIfCpG(record,fasta_ref,offset=0,add_chr=False):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
68 dr = None
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
69 pos = record.pos
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
70 c = record.chrom
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
71 if record.alts is None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
72 return False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
73 if add_chr:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
74 c = 'chr'+record.chrom
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
75 if record.ref == 'C' and 'T' in record.alts:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
76 seq = fasta_ref.fetch(c,pos-1,pos+1)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
77 if seq[0].upper() != 'C':
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
78 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0]))
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
79 #raise Exception("checkIfCpG function not lining up properly")
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
80 if seq[1].upper() == 'G':
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
81 return True
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
82 return False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
83 elif record.ref == 'G' and 'A' in record.alts:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
84 seq = fasta_ref.fetch(c,pos-2,pos)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
85 if seq[1].upper() != 'G':
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
86 logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1]))
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
87 #raise Exception("checkIfCpg function not lining up on negative strand")
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
88 if seq[0].upper() == 'C':
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
89 return True
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
90 return False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
91 return False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
92
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
93 def checkForDuplicates(rec_list,pass_list):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
94 for i in range(len(rec_list)-1):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
95 if rec_list[i].pos == rec_list[i+1].pos:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
96 pass_list[i] = False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
97 pass_list[i+1] = False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
98
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
99 def checkForMultiallele(rec_list,pass_list):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
100 for i in range(len(rec_list)):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
101 if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
102 pass_list[i] = False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
103 pass_list[i+1] = False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
104 if len(rec_list[i].alleles) > 2:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
105 pass_list[i] = False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
106
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
107 def flipChrom(chrom):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
108 if chrom[0:3] == 'chr':
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
109 return chrom[0:3]
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
110 return 'chr'+chrom
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
111
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
112 def getAlleleCountDict(rec):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
113 alleles = defaultdict(int)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
114 total_sites = 0
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
115 missing_inds = 0
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
116 for j in range(len(rec.samples)):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
117 samp = rec.samples[j]
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
118 if None in samp.alleles:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
119 missing_inds += 1
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
120 for k in range(len(samp.alleles)):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
121 b = samp.alleles[k]
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
122 if b is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
123 alleles[b] += 1
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
124 total_sites+=1
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
125 return alleles, total_sites, missing_inds
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
126
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
127 def isInformative(rec, mincount=2, alleles=None):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
128 count = 0
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
129 if alleles is None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
130 alleles, total_sites, missing_inds = getAlleleCountDict(rec)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
131 if len(alleles) != 2:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
132 return False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
133 i1,i2 = alleles.keys()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
134 return (alleles[i1] >= mincount and alleles[i2] >= mincount)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
135
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
136 def getPassSites(record_list, remove_cpg=False, remove_indels=True,
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
137 remove_multiallele=True, remove_missing=0,
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
138 inform_level=2, fasta_ref=None):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
139 pass_list = [True for r in record_list]
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
140 if remove_cpg == True and fasta_ref is None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
141 raise Exception("CpG removal requires a reference")
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
142 if inform_level > 2 or inform_level < 0:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
143 raise Exception("Inform level %d must be between 0 and 2" % inform_level)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
144 if remove_multiallele:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
145 checkForMultiallele(record_list,pass_list)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
146 for i in range(len(record_list)):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
147 rec = record_list[i]
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
148 logging.info(rec.pos)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
149 if remove_indels and not checkRecordIsSnp(rec):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
150 pass_list[i] = False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
151 if remove_cpg and checkIfCpG(rec,fasta_ref):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
152 pass_list[i] = False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
153 alleles,total_sites,missing_inds = getAlleleCountDict(rec)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
154 if remove_missing != -1 and missing_inds > remove_missing:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
155 pass_list[i] = False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
156 if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
157 pass_list[i] = False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
158 #pp = zip([r.pos for r in record_list],pass_list)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
159 #for ppp in pp:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
160 # logging.info(ppp)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
161 return pass_list
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
162
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
163
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
164 def filterSites(record_list, remove_cpg=False, remove_indels=True,
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
165 remove_multiallele=True, remove_missing=0, inform_level=2,
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
166 fasta_ref=None):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
167 pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
168 out_list = []
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
169 for i in range(len(pass_list)):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
170 if pass_list[i]:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
171 out_list.append(record_list[i])
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
172 return out_list
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
173
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
174
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
175
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
176 class VcfReader():
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
177 def __init__(self, vcfname, compress_flag=False, subsamp_num=None,
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
178 subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
179
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
180 ext = checkFormat(vcfname)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
181 if ext in ['gzip','other'] :
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
182 raise Exception(('Input file %s is gzip-formatted, must be either '
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
183 'uncompressed or zipped with bgzip' % vcfname))
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
184 self.file_uncompressed = (ext == 'vcf')
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
185 self.reader_uncompressed = (self.file_uncompressed and not compress_flag)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
186 self.popmodel = None
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
187 self.popkeys = None
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
188 if popmodel is not None and use_allpop:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
189 raise Exception("Popmodel and allpop cannot both be specified")
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
190 if compress_flag and file_uncompressed:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
191 vcfname = compressVcf(vcfname)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
192 if subsamp_num is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
193 if subsamp_list is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
194 raise Exception('Multiple subsampling options called in getVcfReader')
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
195 subsamp_list = getSubsampleList(vcfname, subsamp_num)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
196 elif subsamp_fn is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
197 if subsamp_list is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
198 raise Exception('Multiple subsampling options called in getVcfReader')
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
199 subsamp_file = open(subsamp_fn,'r')
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
200 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
201 subsamp_file.close()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
202
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
203 if index is None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
204 self.reader = pysam.VariantFile(vcfname)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
205 else:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
206 self.reader = pysam.VariantFile(vcfname, index_filename=index)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
207 if popmodel is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
208 self.popmodel = popmodel
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
209 popsamp_list = popmodel.inds
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
210 self.reader.subset_samples(popsamp_list)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
211 self.setPopIdx()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
212 if use_allpop:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
213 self.setAllPop()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
214 if subsamp_list is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
215 logging.debug('Subsampling %d individuals from VCF file' %
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
216 (len(subsamp_list)))
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
217 self.reader.subset_samples(subsamp_list)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
218 self.prev_last_rec = next(self.reader)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
219 self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr')
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
220
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
221 def fetch(self, chrom=None, start=None, end=None):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
222 return self.reader.fetch(chrom, start, end)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
223
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
224 def getRecordList(self, region=None, chrom=None, start=None,
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
225 end=None):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
226 if self.reader_uncompressed:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
227 ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
228 return ret
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
229 else:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
230 return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
231
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
232 def setPopIdx(self):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
233 self.popkeys = {}
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
234 sample_names = [l for l in self.reader.header.samples]
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
235 for p in self.popmodel.pop_list:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
236 self.popkeys[p] = []
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
237 for ind in self.popmodel.ind_dict[p]:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
238 self.popkeys[p].append(sample_names.index(ind))
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
239
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
240 def close(self):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
241 self.reader.close()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
242
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
243 def setAllPop(self):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
244 self.popkeys = {'ALL':[]}
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
245 for i in range(len(self.reader.header.samples)):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
246 self.popkeys['ALL'].append(i)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
247
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
248 def modChrom(c,vcf_chr):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
249 if c is None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
250 return None
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
251 if vcf_chr and c[:3] != 'chr':
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
252 return 'chr'+c
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
253 if not vcf_chr and c[:3] == 'chr':
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
254 return c[3:]
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
255 return c
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
256
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
257 def getRecordList(vcf_reader, region=None, chrom=None, start=None,
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
258 end=None, add_chr=False):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
259 """Returns list for use in subsampling from input file"""
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
260 if region is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
261 c = modChrom(region.chrom,add_chr)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
262 var_sites = vcf_reader.fetch(c, region.start, region.end)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
263 else:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
264 c = modChrom(chrom,add_chr)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
265 var_sites = vcf_reader.fetch(c, start, end)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
266 lst = []
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
267 for rec in var_sites:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
268 lst.append(rec)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
269 return lst
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
270
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
271
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
272 def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None,
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
273 start=None, end=None, add_chr=False):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
274 """Method for getting record list from unzipped VCF file.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
275
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
276 This method will sequentially look through a VCF file until it finds
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
277 the given `start` position on `chrom`, then add all records to a list
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
278 until the `end` position has been reached. Note that `prev_last_rec`
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
279 must be kept track of externally to ensure that if consecutive regions
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
280 are called, the record of the first variant outside the first region
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
281 is not lost.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
282
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
283 Parameters
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
284 ----------
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
285 vcf_reader : pysam VariantFile object
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
286 VCF reader initialized from other function
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
287 region : region object
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
288 Region object with start and end coordinates of region of interest.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
289 prev_last_rec : VariantRecord object
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
290 Variable with last record read from VcfReader. Stored here so that
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
291 if two adjacent regions are called, the overflow record from the
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
292 first region is still included in the next region
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
293
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
294 Returns
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
295 -------
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
296 lst : list
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
297 List of records in given gene region
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
298 prev_last_rec : VariantRecord object
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
299 First record after target region, for use in next call
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
300
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
301 """
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
302 lst = []
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
303 if region is None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
304 lst.append(prev_last_rec)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
305 for rec in vcf_reader:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
306 lst.append(rec)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
307 return lst, lst[-1]
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
308
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
309 if (prev_last_rec is not None and
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
310 region.containsRecord(prev_last_rec) == 'in'):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
311 lst.append(prev_last_rec)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
312 elif (prev_last_rec is not None and
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
313 region.containsRecord(prev_last_rec) == 'after'):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
314 return []
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
315 rec = next(vcf_reader,None)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
316 if rec is None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
317 return lst,None
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
318 place = region.containsRecord(rec)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
319 while rec is not None and place != 'after':
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
320 if place == 'in':
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
321 lst.append(rec)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
322 rec = next(vcf_reader,None)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
323 if rec is None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
324 break
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
325 place = region.containsRecord(rec)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
326 prev_last_rec = rec
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
327 return lst, prev_last_rec
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
328
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
329
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
330 def checkRecordIsSnp(rec):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
331 """Checks if this record is a single nucleotide variant, returns bool."""
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
332 if len(rec.ref) != 1:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
333 return False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
334 if rec.alts is None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
335 return False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
336 for allele in rec.alts:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
337 if len(allele) != 1:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
338 return False
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
339 return True
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
340
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
341
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
342 def getSubsampleList(vcfname, ss_count):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
343 """Returns a list of the first `ss_count` individuals in `vcfname`
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
344
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
345 """
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
346
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
347 vcf_o = pysam.VariantFile(vcfname)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
348 rec = next(vcf_o)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
349 vcf_o.close()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
350 lst = []
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
351 for samp in rec.samples:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
352 lst.append(samp)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
353 return lst[:int(ss_count)]
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
354
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
355
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
356 def compressVcf(vcfname,forceflag=False,remove=False):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
357 """Runs bgzip and tabix on input VCF file.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
358
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
359 Using the pysam library, this function runs the bgzip and tabix utilities
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
360 on the given input file. By default, this will not overwrite an existing
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
361 zipped file, but will overwrite an existing index. `remove` can be set to
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
362 delete the unzipped file.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
363
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
364 Parameters
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
365 ----------
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
366 vcfname : str
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
367 Name of uncompressed VCF file
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
368 forceflag : bool (False)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
369 If true, will overwrite (vcfname).gz if it exists
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
370 remove : bool (False)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
371 If true, will delete uncompressed source file
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
372
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
373 Returns
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
374 -------
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
375 cvcfname : str
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
376 Filepath to compressed VCF file
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
377 """
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
378 cvcfname = vcfname+".gz"
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
379 pysam.tabix_compress(vcfname,cvcfname,force=forceflag)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
380 pysam.tabix_index(cvcfname,preset="vcf",force=True)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
381 if remove:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
382 os.remove(vcfname)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
383 return cvcfname
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
384
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
385 def vcfRegionName(prefix, region, ext, oneidx=False,
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
386 halfopen=True, sep='-'):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
387 chrom = region.toStr(halfopen, oneidx, sep)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
388 return prefix+'_'+chrom+'.'+ext
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
389
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
390 def getRecordsInRegion(region, record_list):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
391 sub_list = []
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
392 for i in range(len(record_list)):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
393 loc = region.containsRecord(record_list[i])
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
394 if loc == "in":
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
395 sub_list.append(record_list[i])
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
396 elif loc == "after":
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
397 break
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
398 return sub_list
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
399
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
400
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
401
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
402
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
403
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
404 #def getVcfReader(args):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
405 def getVcfReader(vcfname, compress_flag=False, subsamp_num=None,
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
406 subsamp_fn=None, subsamp_list=None, index=None):
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
407 """Returns a reader for a given input VCF file.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
408
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
409 Given a filename, filetype, compression option, and optional Subsampling
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
410 options, will return a pysam.VariantFile object for iteration and
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
411 a flag as to whether this file is compressed or uncompressed.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
412
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
413 Parameters
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
414 ----------
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
415 vcfname : str
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
416 Filename for VCF file. The extension of this file will be used to
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
417 determine whether it is compressed or not unless `var_ext` is set.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
418 var_ext : str (None)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
419 Extension for VCF file if it is not included in the filename.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
420 compress_flag : bool (False)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
421 If filetype is uncompressed and this is set to true, will run
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
422 compressVcf function.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
423 subsamp_num : int (None)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
424 If set, will randomly select `subsamp_num` individuals (not
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
425 genotypes) from the input VCF file and return a reader with
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
426 only those data.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
427 subsamp_fn : str (None)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
428 If set, will return a reader with only data from the samples listed
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
429 in the file provided. Cannot be used with other subsampling options.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
430 subsamp_list : list (None)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
431 If set, will return reader with records containing only
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
432 individuals named in the list. Cannot be used with other subsampling
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
433 options.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
434
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
435 Returns
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
436 -------
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
437 vcf_reader : pysam.VariantFile
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
438 A reader that can be iterated through for variant records. If
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
439 compressed, it will be able to use the pysam fetch method, otherwise
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
440 it must be read through sequentially
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
441 reader_uncompressed : bool
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
442 If True, VCF reader is uncompressed. This means the fetch method
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
443 cannot be used and region access must be done using the
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
444 "getRecordListUnzipped" method.
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
445
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
446 """
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
447 ext = checkFormat(vcfname)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
448 if ext in ['gzip','other'] :
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
449 raise Exception(('Input file %s is gzip-formatted, must be either '
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
450 'uncompressed or zipped with bgzip' % vcfname))
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
451 file_uncompressed = (ext == 'vcf')
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
452 reader_uncompressed = (file_uncompressed and not compress_flag)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
453 if compress_flag and file_uncompressed:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
454 vcfname = compressVcf(vcfname)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
455 #subsamp_list = None
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
456 if subsamp_num is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
457 if subsamp_list is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
458 raise Exception('Multiple subsampling options called in getVcfReader')
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
459 subsamp_list = getSubsampleList(vcfname, subsamp_num)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
460 elif subsamp_fn is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
461 if subsamp_list is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
462 raise Exception('Multiple subsampling options called in getVcfReader')
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
463 subsamp_file = open(subsamp_fn,'r')
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
464 subsamp_list = [l.strip() for l in subsamp_file.readlines()]
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
465 subsamp_file.close()
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
466 if index is None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
467 vcf_reader = pysam.VariantFile(vcfname)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
468 else:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
469 vcf_reader = pysam.VariantFile(vcfname, index_filename=index)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
470 if subsamp_list is not None:
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
471 logging.debug('Subsampling %d individuals from VCF file' %
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
472 (len(subsamp_list)))
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
473 vcf_reader.subset_samples(subsamp_list)
9127b49793a1 Uploaded
jaredgk
parents:
diff changeset
474 return vcf_reader, reader_uncompressed