annotate bcftools.py @ 5:86a9d8d5b291 draft default tip

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:34:34 -0400
parents 3830d29fca6a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
1 import os
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
2 import sys
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
3 import logging
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
4 import subprocess
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
5
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
6 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
7
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
8 from vcf_reader_func import checkFormat
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
9
5
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
def return_output_format_args (output_format):
    '''
    Translate an output format name into bcftools output-type arguments

    Parameters
    ----------
    output_format : str
        The requested output format: 'vcf', 'bcf' or 'vcf.gz'

    Returns
    -------
    list
        The '-O' flag followed by the matching bcftools type code

    Raises
    ------
    Exception
        If the output format is not one of the supported names
    '''

    # Each supported format name paired with its bcftools -O type code
    supported_formats = (('vcf', 'v'), ('bcf', 'b'), ('vcf.gz', 'z'))

    for format_name, type_code in supported_formats:
        if output_format == format_name:
            # A new list is built per call, so callers may modify it freely
            return ['-O', type_code]

    # No supported format matched
    raise Exception('Unsupported file format')
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
34
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def check_bcftools_for_errors (bcftools_stderr):
    '''
    Checks the bcftools stderr for errors

    Empty stderr is accepted silently. Stderr containing a warning tag
    ('W::') is logged as a warning, unless it also contains an error tag
    ('E::'), in which case it is raised. Any other non-empty stderr is
    raised as an error.

    Parameters
    ----------
    bcftools_stderr : str
        bcftools stderr

    Raises
    ------
    Exception
        If bcftools stderr returns an error
    '''

    # Expand as errors are discovered

    # Nothing was written to stderr, nothing to report
    if not bcftools_stderr:
        return

    # An error tag takes priority over warnings: a warning printed during
    # the same run must not hide a failure reported alongside it
    if 'E::' in bcftools_stderr:
        raise Exception(bcftools_stderr)

    # Log warning messages
    if 'W::' in bcftools_stderr:
        logging.warning(bcftools_stderr.strip())

    # Report errors that are not warnings
    else:
        raise Exception(bcftools_stderr)
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
59
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
def pipe_bcftools (bcftools_call_args):
    '''
    Starts bcftools with its stdout and stderr attached to pipes

    The bcftools process is returned as soon as it has been started: this
    function does not wait for it to finish and does not inspect its
    stderr. It is intended for cases where the bcftools output is consumed
    by other code. Once the output has been read, the caller should check
    the stderr using check_bcftools_for_errors.

    Parameters
    ----------
    bcftools_call_args : list
        bcftools arguments (each is converted to str)

    Returns
    -------
    bcftools_call : subprocess.Popen
        The running bcftools process, with stdout and stderr pipes

    '''

    # Assemble the full command line, bcftools first
    bcftools_command = ['bcftools']
    bcftools_command.extend([str(call_arg) for call_arg in bcftools_call_args])

    # Start bcftools without waiting for it
    return subprocess.Popen(bcftools_command,
                            stdout = subprocess.PIPE,
                            stderr = subprocess.PIPE)
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
86
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
def pipe_bcftools_to_chr (vcf_filename):
    '''
    Pipes chromosome and/or contig output of bcftools to a list of unique
    entries

    The purpose of this function is to return a list of the unique
    chromosomes and/or contigs for use in other functions. The names are
    read from a 'bcftools query' call that prints one name per line.

    Parameters
    ----------
    vcf_filename : str
        VCF input

    Returns
    -------
    chromosomes_to_return : list
        Unique chromosomes and/or contigs within VCF input, as str

    Raises
    ------
    Exception
        If reading the bcftools output fails, or if the bcftools stderr
        returns an error
    '''

    # Open bcftools pipe
    bcftools_call = pipe_bcftools(['query', '-f', '%CHROM\n', vcf_filename])

    # Create a set to hold unique chromosome
    chromosomes_to_return = set()

    # The pipe yields bytes under python 3; names are decoded there so the
    # returned values are str, like the decoded stderr
    decode_output = sys.version_info[0] == 3

    try:

        # Current chromosomes/contigs, reduces duplicates if VCF is sorted
        previous_chr = None

        # Iterate the bcftools stdout unless error occurs
        for bcftools_stdout_line in iter(bcftools_call.stdout.readline, b''):
            # Remove the newline character
            bcftools_line_chr = bcftools_stdout_line.strip()
            # Check if the bcftools chr is different from stored chr
            if bcftools_line_chr != previous_chr:
                # Store the new chr for comparisons to reduce duplicates
                previous_chr = bcftools_line_chr
                # Save the chr
                if decode_output:
                    chromosomes_to_return.add(bcftools_line_chr.decode())
                else:
                    chromosomes_to_return.add(bcftools_line_chr)

    # Only ordinary errors are converted; KeyboardInterrupt and SystemExit
    # propagate unchanged
    except Exception:
        raise Exception('bcftools call error')

    finally:
        # Close the bcftools stdout, also when reading failed
        bcftools_call.stdout.close()

    # Wait for bcftools to finish
    bcftools_call.wait()

    # Read the bcftools stderr, then release the pipe
    bcftools_stderr = bcftools_call.stderr.read()
    bcftools_call.stderr.close()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        bcftools_stderr = bcftools_stderr.decode()

    # Check the bcftools stderr for errors
    check_bcftools_for_errors(bcftools_stderr)

    logging.info('bcftools call complete')

    return list(chromosomes_to_return)
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
151
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def call_bcftools (bcftools_call_args):
    '''
    Calls bcftools

    The function runs bcftools to completion with the given arguments,
    then checks its stderr and its exit status.

    Parameters
    ----------
    bcftools_call_args : list
        bcftools arguments (each is converted to str)

    Returns
    -------
    bcftools_stderr : str
        bcftools log output (stderr)

    Raises
    ------
    Exception
        If bcftools stderr returns an error, or if bcftools exits with a
        non-zero status
    '''

    # bcftools subprocess call
    bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Wait for bcftools to finish. Both pipes are drained; only the stderr
    # is used, the stdout is discarded
    bcftools_stderr = bcftools_call.communicate()[1]

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        bcftools_stderr = bcftools_stderr.decode()

    check_bcftools_for_errors(bcftools_stderr)

    # A failed run may leave the stderr empty or holding only warnings,
    # which the stderr check accepts; the exit status catches those cases
    if bcftools_call.returncode != 0:
        raise Exception('bcftools exited with status %s: %s' % (bcftools_call.returncode, bcftools_stderr.strip()))

    logging.info('bcftools call complete')

    return bcftools_stderr
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
190
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def check_for_index (filename):
    '''
    Reports whether an index file exists for the given file

    The file format is determined with checkFormat. For the two formats
    handled as indexable here, the function looks for the index next to
    the file: filename + '.tbi' for a bgzipped-VCF and filename + '.csi'
    for a BCF. A plain VCF yields None, and the remaining formats are
    rejected with an error.

    Parameters
    ----------
    filename : str
        Filename of VCF-formatted file

    Returns
    -------
    bool, None
        True or False for bgzipped-VCF and BCF files, depending on whether
        the index file exists. None for VCF files

    Raises
    ------
    Exception
        If the file is a gzipped-VCF or of an unknown format
    '''

    # Assign the file format
    file_format = checkFormat(filename)

    # Plain VCF: no index is looked for
    if file_format == 'vcf':
        return None

    # Gzip-compressed VCF: rejected outright
    if file_format == 'gzip':
        raise Exception('GZIP-compressed VCF files do not support index files.')

    # Select the index extension looked for with each indexable format
    if file_format == 'bgzip':
        index_extension = '.tbi'
    elif file_format == 'bcf':
        index_extension = '.csi'
    else:
        raise Exception('Unknown file format')

    # True if the index file exists, False otherwise
    return os.path.isfile(filename + index_extension)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
246
5
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
def delete_index (filename):
    '''
    Removes the index file belonging to the given file

    The file format is determined with checkFormat. For a bgzipped-VCF the
    index filename + '.tbi' is removed, for a BCF the index
    filename + '.csi'. A VCF or gzip-compressed VCF results in an error,
    as does an index that cannot be found. Intended to be used after
    check_for_index.

    Parameters
    ----------
    filename : str
        Filename of VCF-formatted file

    Raises
    ------
    Exception
        If the file is a VCF or a gzipped-VCF
    Exception
        If no index file could be found (this includes files of any
        other format)
    '''

    # Assign the file format
    file_format = checkFormat(filename)

    # Formats that are rejected outright
    if file_format == 'vcf':
        raise Exception('VCF format does not support index files.')
    if file_format == 'gzip':
        raise Exception('GZIP-compressed VCF files do not support index files.')

    # Work out where the index is expected for the indexable formats
    if file_format == 'bgzip':
        index_filename = filename + '.tbi'
    elif file_format == 'bcf':
        index_filename = filename + '.csi'
    else:
        index_filename = None

    # Return error if no index is found
    if index_filename is None or not os.path.isfile(index_filename):
        raise Exception('No index file found.')

    # Delete the index
    os.remove(index_filename)
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
299
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def create_index (filename):
    '''
    Builds an index file for the given file using bcftools

    The file format is determined with checkFormat. A bgzipped-VCF is
    indexed with 'bcftools index -t' and a BCF with 'bcftools index -c'.
    Any other format results in an error.

    Parameters
    ----------
    filename : str
        Filename of VCF-formatted file

    Raises
    ------
    Exception
        If the file is not a bgzipped-VCF or BCF
    '''

    # Assign the file format
    file_format = checkFormat(filename)

    # Select the bcftools index flag for the format
    if file_format == 'bgzip':
        # Flag used for the .tbi index
        index_flag = '-t'
    elif file_format == 'bcf':
        # Flag used for the .csi index
        index_flag = '-c'
    else:
        # Report if file cannot be indexed
        raise Exception('Error creating index for: %s. Only .bcf and .vcf.gz (bgzip) files are supported.' % filename)

    # Create the index
    call_bcftools(['index', index_flag, filename])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
335
5
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
def chr_subset_file (filename, chromosome, output_prefix, output_format, from_bp = None, to_bp = None):
    '''
    Writes a subset of the input restricted to one chromosome

    Runs 'bcftools view' on the input and writes the result to
    '<output_prefix>.<output_format>'. The subset is the chromosome,
    optionally narrowed by the position bounds. It is passed to bcftools
    with '-r' when check_for_index reports an index for the input and with
    '-t' otherwise.

    Parameters
    ----------
    filename : str
        Filename of VCF-formatted input
    chromosome : str
        Chromosome to subset
    output_prefix : str
        Prefix of the VCF-formatted output (i.e. without file extension)
    output_format : str
        The format of the output (e.g. vcf, bcf, vcf.gz)
    from_bp : int, optional
        Lower bound of sites to include
    to_bp : int, optional
        Upper bound of sites to include
    '''

    # Start the bcftools call with the output format arguments
    bcftools_args = ['view'] + return_output_format_args(output_format)

    # Assign the output file, named from the prefix and the format
    bcftools_args += ['-o', '%s.%s' % (output_prefix, output_format)]

    # The subset starts out as the chromosome alone
    subset_region = chromosome

    # Check if either bp position arguments were specified
    if from_bp or to_bp:
        # Position pieces in their required order, skipping empty values
        # (an unspecified bound, but also a bound of 0)
        # NOTE(review): with only to_bp given this yields 'chrom:-to_bp' -
        # confirm bcftools accepts a region without a lower bound
        subset_region += ''.join([str(piece) for piece in (':', from_bp, '-', to_bp) if piece])

    # Subset with -r if the input has an index, otherwise with -t
    subset_flag = '-r' if check_for_index(filename) else '-t'

    # Add the subset arguments, followed by the input file
    bcftools_args += [subset_flag, subset_region, filename]

    # Call bcftools
    call_bcftools(bcftools_args)
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
402
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
def concatenate (filenames, output_prefix, output_format, keep_original = False):
    '''
    Concatenate multiple VCF-formatted files

    This function will concatenate multiple VCF-formatted files into a
    single VCF-formatted file of the specified format, using the bcftools
    concat command. The function also has the option to keep or delete
    the input files (and any index found for them) once the concatenated
    file has been created. Note that the input files are deleted by
    default.

    Parameters
    ----------
    filenames : list
        List of VCF-formatted input filenames
    output_prefix : str
        Prefix of the VCF-formatted output (i.e. without file extension)
    output_format : str
        The format of the output (e.g. vcf, bcf, vcf.gz)
    keep_original : bool, optional
        If the input files should be kept once concatenated
    '''

    # Holds the arguments to concatenate the input files
    concat_args = ['concat']

    # Assign and store the output format arguments
    concat_args.extend(return_output_format_args(output_format))

    # Stores the specified output filename
    vcf_output = '%s.%s' % (output_prefix, output_format)

    # Assigns the output file to the arguments
    concat_args.extend(['-o', vcf_output])

    # Assigns the input files to concatenate
    concat_args.extend(filenames)

    # Call bcftools
    call_bcftools(concat_args)

    # Delete the original files once the concatenated file is created
    if not keep_original:
        for filename in filenames:
            # Remove the index first, while the file it belongs to still exists
            if check_for_index(filename):
                delete_index(filename)
            os.remove(filename)
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
447
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
def convert_to_bcf (filename, output_prefix, keep_original = False):
    '''
    Converts a VCF-formatted file to BCF

    This function will convert a VCF-formatted file to BCF with the
    specified filename prefix. The function also has the option to keep or
    delete the input file (and any index found for it) once the BCF file
    has been created. Note that the input file is deleted by default.

    Parameters
    ----------
    filename : str
        Filename of VCF-formatted input
    output_prefix : str
        Prefix of the BCF output (i.e. without file extension)
    keep_original : bool, optional
        If the input file should be kept once converted
    '''

    # Stores the specified output_prefix to the BCF file
    bcf_output = '%s.bcf' % output_prefix

    # Holds the arguments to convert to BCF format, the output file, and
    # lastly the specified input
    convert_args = ['convert', '-O', 'b', '-o', bcf_output, filename]

    # Call bcftools
    call_bcftools(convert_args)

    # Delete the original file once the bcf file is created
    if not keep_original:
        # Remove the index first, while the file it belongs to still exists
        if check_for_index(filename):
            delete_index(filename)
        os.remove(filename)
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
486
5
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
def convert_to_vcf (filename, output_prefix, keep_original = False):
    '''
    Converts a VCF-formatted file to uncompressed VCF

    This function will convert a VCF-formatted file to VCF with the
    specified filename prefix. The function also has the option to keep or
    delete the input file (and any index found for it) once the VCF file
    has been created. Note that the input file is deleted by default.

    Parameters
    ----------
    filename : str
        Filename of VCF-formatted input
    output_prefix : str
        Prefix of the VCF output (i.e. without file extension)
    keep_original : bool, optional
        If the input file should be kept once converted
    '''

    # Stores the specified output_prefix to the VCF file
    vcf_output = '%s.vcf' % output_prefix

    # Holds the arguments to convert to VCF format, the output file, and
    # lastly the specified input
    convert_args = ['view', '-O', 'v', '-o', vcf_output, filename]

    # Call bcftools
    call_bcftools(convert_args)

    # Delete the original file once the vcf file is created
    if not keep_original:
        # Remove the index first, while the file it belongs to still exists
        if check_for_index(filename):
            delete_index(filename)
        os.remove(filename)
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
525
86a9d8d5b291 Uploaded
jaredgk
parents: 0
diff changeset
def convert_to_vcfgz (filename, output_prefix, keep_original = False):
    '''
    Converts a VCF-formatted file to bgzipped-VCF

    This function will convert a VCF-formatted file to bgzipped-VCF with the
    specified filename prefix. The function also has the option to keep or
    delete the input file (and any index found for it) once the
    bgzipped-VCF file has been created. Note that the input file is deleted
    by default.

    Parameters
    ----------
    filename : str
        Filename of VCF-formatted input
    output_prefix : str
        Prefix of the bgzipped-VCF output (i.e. without file extension)
    keep_original : bool, optional
        If the input file should be kept once converted
    '''

    # Stores the specified output_prefix to the VCFGZ file
    vcfgz_output = '%s.vcf.gz' % output_prefix

    # Holds the arguments to convert to VCFGZ format, the output file, and
    # lastly the specified input
    convert_args = ['view', '-O', 'z', '-o', vcfgz_output, filename]

    # Call bcftools
    call_bcftools(convert_args)

    # Delete the original file once the vcfgz file is created
    if not keep_original:
        # Remove the index first, while the file it belongs to still exists
        if check_for_index(filename):
            delete_index(filename)
        os.remove(filename)