0
|
1 import os
|
|
2 import sys
|
|
3 import logging
|
|
4 import subprocess
|
|
5
|
|
6 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
|
|
7
|
|
8 from vcf_reader_func import checkFormat
|
|
9
|
5
|
10 def return_output_format_args (output_format):
|
|
11 '''
|
|
12 Return bcftools arguments for output format
|
|
13
|
|
14 Parameters
|
|
15 ----------
|
|
16 output_format : str
|
|
17 The specified output format
|
|
18
|
|
19 Raises
|
|
20 ------
|
|
21 Exception
|
|
22 If output format is unsupported by bcftools
|
|
23 '''
|
|
24
|
|
25 # Return the output format arguments
|
|
26 if output_format == 'vcf':
|
|
27 return ['-O', 'v']
|
|
28 elif output_format == 'bcf':
|
|
29 return ['-O', 'b']
|
|
30 elif output_format == 'vcf.gz':
|
|
31 return ['-O', 'z']
|
|
32 else:
|
|
33 raise Exception('Unsupported file format')
|
|
34
|
0
|
35 def check_bcftools_for_errors (bcftools_stderr):
|
|
36 '''
|
|
37 Checks the bgzip stderr for errors
|
|
38
|
|
39 Parameters
|
|
40 ----------
|
|
41 bcftools_stderr : str
|
|
42 bcftools stderr
|
|
43
|
|
44 Raises
|
|
45 ------
|
5
|
46 Exception
|
0
|
47 If bcftools stderr returns an error
|
|
48 '''
|
|
49
|
|
50 # Expand as errors are discovered
|
5
|
51
|
|
52 # Log warning messages
|
|
53 if 'W::' in bcftools_stderr:
|
|
54 logging.warning(bcftools_stderr.strip())
|
|
55
|
|
56 # Report errors that are not warnings
|
|
57 elif bcftools_stderr:
|
|
58 raise Exception(bcftools_stderr)
|
|
59
|
|
60 def pipe_bcftools (bcftools_call_args):
|
|
61 '''
|
|
62 Calls bcftools with pipe output
|
|
63
|
|
64 The output of this function is the stdout and stderr of bcftools. This
|
|
65 function should only be used if bcftools is being used as the stdin of
|
|
66 another function. Please note that this function does not check the for
|
|
67 errors in the bcftools call. Please check for errors after the call is
|
|
68 closed using check_bcftools_for_errors.
|
|
69
|
|
70 Parameters
|
|
71 ----------
|
|
72 bcftools_stderr : str
|
|
73 bcftools stderr
|
|
74
|
|
75 Returns
|
|
76 -------
|
|
77 bcftools_call : PIPE
|
|
78 Pipe of subprocess call, including both stdout and stderr
|
|
79
|
|
80 '''
|
|
81
|
|
82 # bcftools subprocess call
|
|
83 bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
84
|
|
85 return bcftools_call
|
|
86
|
|
87 def pipe_bcftools_to_chr (vcf_filename):
|
|
88 '''
|
|
89 Pipes chromosome and/or contig output of bcftools to a list of unique
|
|
90 entries
|
|
91
|
|
92 The purpose of this function is to return a list of the unique
|
|
93 chromosomes and/or contigs for use in other functions.
|
|
94
|
|
95 Parameters
|
|
96 ----------
|
|
97 vcf_filename : str
|
|
98 VCF input
|
|
99
|
|
100 Returns
|
|
101 -------
|
|
102 chromosomes_to_return : list
|
|
103 Unique chromosomes and/or contigs within VCF input
|
|
104 '''
|
|
105
|
|
106 # Open bcftools pipe
|
|
107 bcftools_call = pipe_bcftools(['query', '-f', '%CHROM\n', vcf_filename])
|
|
108
|
|
109 # Create a set to hold unique chromosome
|
|
110 chromosomes_to_return = set()
|
|
111
|
|
112 try:
|
|
113
|
|
114 # Current chromosomes/contigs, reduces duplicates if VCF is sorted
|
|
115 previous_chr = None
|
|
116
|
|
117 # Iterate the bcftools stdout unless error occurs
|
|
118 for bcftools_stdout_line in iter(bcftools_call.stdout.readline, b''):
|
|
119 # Remove the newline character
|
|
120 bcftools_line_chr = bcftools_stdout_line.strip()
|
|
121 # Check if the bcftools bcftools chr is different from stored chr
|
|
122 if bcftools_line_chr != previous_chr:
|
|
123 # Store the new chr for comparisons to reduce duplicates
|
|
124 previous_chr = bcftools_line_chr
|
|
125 # Save the chr
|
|
126 chromosomes_to_return.add(bcftools_line_chr)
|
|
127
|
|
128 except:
|
|
129 raise Exception('bcftools call error')
|
|
130
|
|
131 # Close the bcftools stdout
|
|
132 bcftools_call.stdout.close()
|
|
133
|
|
134 # Wait for bctools to finish
|
|
135 bcftools_call.wait()
|
|
136
|
|
137 # Read the bcftools stderr
|
|
138 bcftools_stderr = bcftools_call.stderr.read()
|
|
139
|
|
140 # Check if code is running in python 3
|
|
141 if sys.version_info[0] == 3:
|
|
142 # Convert bytes to string
|
|
143 bcftools_stderr = bcftools_stderr.decode()
|
|
144
|
|
145 # Check that the log file was created correctly
|
|
146 check_bcftools_for_errors(bcftools_stderr)
|
|
147
|
|
148 logging.info('bcftools call complete')
|
|
149
|
|
150 return list(chromosomes_to_return)
|
0
|
151
|
|
152 def call_bcftools (bcftools_call_args):
|
5
|
153 '''
|
|
154 Calls bcftools
|
|
155
|
|
156 The function calls bcftools.
|
|
157
|
|
158 Parameters
|
|
159 ----------
|
|
160 bcftools_call_args : list
|
|
161 bcftools arguments
|
|
162
|
|
163 Returns
|
|
164 -------
|
|
165 vcftools_err : str
|
|
166 vcftools log output
|
|
167
|
|
168 Raises
|
|
169 ------
|
|
170 Exception
|
|
171 If bcftools stderr returns an error
|
|
172 '''
|
0
|
173
|
|
174 # bcftools subprocess call
|
|
175 bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
176
|
|
177 # Wait for bcftools to finish
|
5
|
178 bcftools_stdout, bcftools_stderr = bcftools_call.communicate()
|
0
|
179
|
5
|
180 # Check if code is running in python 3
|
|
181 if sys.version_info[0] == 3:
|
|
182 # Convert bytes to string
|
|
183 bcftools_stderr = bcftools_stderr.decode()
|
|
184
|
|
185 check_bcftools_for_errors(bcftools_stderr)
|
0
|
186
|
|
187 logging.info('bcftools call complete')
|
|
188
|
5
|
189 return bcftools_stderr
|
|
190
|
0
|
191 def check_for_index (filename):
|
5
|
192 '''
|
|
193 Checks for index file
|
|
194
|
|
195 If the file is capable of having an index (i.e. bgzipped-VCF or BCF) the
|
|
196 function will return either True (i.e. index found) or False. However,
|
|
197 if the file is a VCF the function will return None (as VCF files cannot
|
|
198 have an index). An error is returned if the file is either a
|
|
199 gzipped-VCF file or not a VCF-based format.
|
|
200
|
|
201 Parameters
|
|
202 ----------
|
|
203 filename : str
|
|
204 Filename of VCF-formatted file
|
|
205
|
|
206 Returns
|
|
207 -------
|
|
208 bool, None
|
|
209 Returns bool for VCF.GZ and BCF files. Returns None for VCF files
|
|
210
|
|
211 Raises
|
|
212 ------
|
|
213 Exception
|
|
214 If the file is a gzipped-VCF or of an unknown format
|
|
215 '''
|
0
|
216
|
|
217 # Assign the file format
|
|
218 file_format = checkFormat(filename)
|
|
219
|
|
220 # Check if the file to be indexed is a vcf.gz
|
|
221 if file_format == 'bgzip':
|
|
222 # Check if the index (.tbi) exists
|
|
223 if os.path.isfile(filename + '.tbi'):
|
|
224 return True
|
|
225
|
|
226 # Check if the file to be indexed is a bcf
|
|
227 elif file_format == 'bcf':
|
|
228 # Check if the index (.csi) exists
|
|
229 if os.path.isfile(filename + '.csi'):
|
|
230 return True
|
|
231
|
5
|
232 # Check if the file is vcf (does not need an index)
|
|
233 elif file_format == 'vcf':
|
|
234 return None
|
|
235
|
|
236 # Check if the file is gzip-compressed vcf (cannot have an index)
|
|
237 elif file_format == 'gzip':
|
|
238 raise Exception('GZIP-compressed VCF files do not support index files.')
|
|
239
|
|
240 # Check if the file is an unknown format
|
|
241 else:
|
|
242 raise Exception('Unknown file format')
|
|
243
|
0
|
244 # Return false if no index is found
|
|
245 return False
|
|
246
|
5
|
247 def delete_index (filename):
|
|
248 '''
|
|
249 Deletes an index file
|
|
250
|
|
251 If the file is capable of having an index (i.e. bgzipped-VCF or BCF)
|
|
252 this function will delete the index. However, if the file is either a
|
|
253 VCF or a gzip-compressed VCF the function will return an error. The
|
|
254 function also results in an error if the index cannot be found. This
|
|
255 function should be used following check_for_index.
|
|
256
|
|
257 Parameters
|
|
258 ----------
|
|
259 filename : str
|
|
260 Filename of VCF-formatted file
|
|
261
|
|
262 Raises
|
|
263 ------
|
|
264 Exception
|
|
265 No index file could be found
|
|
266 Exception
|
|
267 If the file is a gzipped-VCF or a VCF
|
|
268 '''
|
|
269
|
|
270 # Assign the file format
|
|
271 file_format = checkFormat(filename)
|
|
272
|
|
273 # Check if the file to be indexed is a vcf.gz
|
|
274 if file_format == 'bgzip':
|
|
275 # Check if the index (.tbi) exists
|
|
276 if os.path.isfile(filename + '.tbi'):
|
|
277 # Delete the index
|
|
278 os.remove(filename + '.tbi')
|
|
279 return
|
|
280
|
|
281 # Check if the file to be indexed is a bcf
|
|
282 elif file_format == 'bcf':
|
|
283 # Check if the index (.csi) exists
|
|
284 if os.path.isfile(filename + '.csi'):
|
|
285 # Delete the index
|
|
286 os.remove(filename + '.csi')
|
|
287 return
|
|
288
|
|
289 # Check if the file is vcf (cannot have an index)
|
|
290 elif file_format == 'vcf':
|
|
291 raise Exception('VCF format does not support index files.')
|
|
292
|
|
293 # Check if the file is gzip-compressed vcf (cannot have an index)
|
|
294 elif file_format == 'gzip':
|
|
295 raise Exception('GZIP-compressed VCF files do not support index files.')
|
|
296
|
|
297 # Return error if no index is found
|
|
298 raise Exception('No index file found.')
|
|
299
|
0
|
300 def create_index (filename):
|
5
|
301 '''
|
|
302 Creates an index file
|
|
303
|
|
304 If the file is capable of having an index (i.e. bgzipped-VCF or BCF)
|
|
305 this function will create an index file. However, if the file is a
|
|
306 different format the function will return an error.
|
|
307
|
|
308 Parameters
|
|
309 ----------
|
|
310 filename : str
|
|
311 Filename of VCF-formatted file
|
|
312
|
|
313 Raises
|
|
314 ------
|
|
315 Exception
|
|
316 If the file is not a bgzipped-VCF or BCF
|
|
317 '''
|
0
|
318
|
|
319 # Assign the file format
|
|
320 file_format = checkFormat(filename)
|
|
321
|
|
322 # Check if the file to be indexed is a vcf.gz
|
|
323 if file_format == 'bgzip':
|
|
324 # Create a index (.tbi)
|
|
325 call_bcftools(['index', '-t', filename])
|
|
326
|
|
327 # Check if the file to be indexed is a bcf
|
|
328 elif file_format == 'bcf':
|
|
329 # Create a index (.csi)
|
|
330 call_bcftools(['index', '-c', filename])
|
|
331
|
|
332 # Report if file cannot be indexed
|
|
333 else:
|
|
334 raise Exception('Error creating index for: %s. Only .bcf and .vcf.gz (bgzip) files are supported.' % filename)
|
|
335
|
5
|
336 def chr_subset_file (filename, chromosome, output_prefix, output_format, from_bp = None, to_bp = None):
|
|
337 '''
|
|
338 Creates chromosome subset
|
|
339
|
|
340 This function is used to create a VCF-formatted subset with only
|
|
341 the data from a single chromosome.
|
|
342
|
|
343 Parameters
|
|
344 ----------
|
|
345 filename : str
|
|
346 Filename of VCF-formatted input
|
|
347 chromosome : str
|
|
348 Chromosome to subset
|
|
349 output_prefix : str
|
|
350 Prefix of the VCF-formatted output (i.e. without file extension)
|
|
351 output_format : str
|
|
352 The format of the output (e.g. vcf, bcf, vcf.gz)
|
|
353 from_bp : int, optional
|
|
354 Lower bound of sites to include
|
|
355 to_bp : int, optional
|
|
356 Upper bound of sites to include
|
|
357 '''
|
|
358
|
|
359 # Creates a list to the arguments and store the bcftools call
|
|
360 subset_args = ['view']
|
|
361
|
|
362 # Assign the output format arguments
|
|
363 output_format_args = return_output_format_args(output_format)
|
|
364
|
|
365 # Store the output format arguments
|
|
366 subset_args.extend(output_format_args)
|
|
367
|
|
368 # Stores the specified output filename
|
|
369 vcf_output = '%s.%s' % (output_prefix, output_format)
|
|
370
|
|
371 # Assigns the output file to the arguments
|
|
372 subset_args.extend(['-o', vcf_output])
|
|
373
|
|
374 # Holds the subset argument
|
|
375 chr_subet_arg = chromosome
|
|
376
|
|
377 # Check if either bp position arguments were specified
|
|
378 if from_bp or to_bp:
|
|
379
|
|
380 # List of the position arguments, in their required order
|
|
381 position_args = [':', from_bp, '-', to_bp]
|
|
382
|
|
383 # Filter the position arguments to remove empty values
|
|
384 filttered_position_args = filter(None, position_args)
|
|
385
|
|
386 # Map the arguments to str and add them to the chromosome argument
|
|
387 chr_subet_arg += ''.join(map(str, filttered_position_args))
|
|
388
|
|
389 # Checks if the input file has an index, then subset to the arguments
|
|
390 if check_for_index(filename):
|
|
391 # Subsets using the index
|
|
392 subset_args.extend(['-r', chr_subet_arg])
|
|
393 else:
|
|
394 # Subsets using the stdout
|
|
395 subset_args.extend(['-t', chr_subet_arg])
|
|
396
|
|
397 # Assigns the input file to the arguments
|
|
398 subset_args.append(filename)
|
|
399
|
|
400 # Call bcftools
|
|
401 call_bcftools(subset_args)
|
|
402
|
|
403 def concatenate (filenames, output_prefix, output_format, keep_original = False):
|
|
404 '''
|
|
405 Concatenate multiple VCF-formatted files
|
|
406
|
|
407 This function will concatenate multiple VCF-formatted files into a
|
|
408 single VCF-formatted file of the specifed format.
|
|
409
|
|
410 Parameters
|
|
411 ----------
|
|
412 filenames : list
|
|
413 List of VCF-formatted input filenames
|
|
414 output_prefix : str
|
|
415 Prefix of the VCF-formatted output (i.e. without file extension)
|
|
416 output_format : str
|
|
417 The format of the output (e.g. vcf, bcf, vcf.gz)
|
|
418 '''
|
|
419
|
|
420 # Holds the arguments to convert to VCF format
|
|
421 concat_args = ['concat']
|
|
422
|
|
423 # Assign the output format arguments
|
|
424 output_format_args = return_output_format_args(output_format)
|
|
425
|
|
426 # Store the output format arguments
|
|
427 concat_args.extend(output_format_args)
|
|
428
|
|
429 # Stores the specified output filename
|
|
430 vcf_output = '%s.%s' % (output_prefix, output_format)
|
|
431
|
|
432 # Assigns the output file to the arguments
|
|
433 concat_args.extend(['-o', vcf_output])
|
|
434
|
|
435 # Assigns the input files to merge
|
|
436 concat_args.extend(filenames)
|
|
437
|
|
438 # Call bcftools
|
|
439 call_bcftools(concat_args)
|
|
440
|
|
441 # Delete the original files once the merged file is created
|
|
442 if not keep_original:
|
|
443 for filename in filenames:
|
|
444 if check_for_index(filename) == True:
|
|
445 delete_index(filename)
|
|
446 os.remove(filename)
|
|
447
|
|
448 def convert_to_bcf (filename, output_prefix, keep_original = False):
|
|
449 '''
|
|
450 Converts a VCF-formatted file to BCF
|
|
451
|
|
452 This function will convert a VCF-formatted file to BCF with the
|
|
453 specified filename prefix. The function also has the option to keep or
|
|
454 delete the input file once the BCF file has been created.
|
|
455
|
|
456 Parameters
|
|
457 ----------
|
|
458 filename : str
|
|
459 Filename of VCF-formatted input
|
|
460 output_prefix : str
|
|
461 Prefix of the BCF output (i.e. without file extension)
|
|
462 keep_original : bool, optional
|
|
463 If the input file should be kept once converted
|
|
464 '''
|
0
|
465
|
|
466 # Holds the arguments to convert to BCF format
|
|
467 convert_args = ['convert', '-O', 'b']
|
|
468
|
|
469 # Stores the specified output_prefix to the BCF file
|
|
470 bcf_output = '%s.bcf' % output_prefix
|
|
471
|
|
472 # Assigns the output file to the arguments
|
|
473 convert_args.extend(['-o', bcf_output])
|
|
474
|
|
475 # Assigns the specified input to the arguments
|
|
476 convert_args.append(filename)
|
|
477
|
|
478 # Call bcftools
|
|
479 call_bcftools(convert_args)
|
|
480
|
5
|
481 # Delete the original file once the bcf file is created
|
|
482 if not keep_original:
|
|
483 if check_for_index(filename) == True:
|
|
484 delete_index(filename)
|
|
485 os.remove(filename)
|
0
|
486
|
5
|
487 def convert_to_vcf (filename, output_prefix, keep_original = False):
|
|
488 '''
|
|
489 Converts a VCF-formatted file to VCF
|
|
490
|
|
491 This function will convert a VCF-formatted file to VCF with the
|
|
492 specified filename prefix. The function also has the option to keep or
|
|
493 delete the input file once the VCF file has been created.
|
|
494
|
|
495 Parameters
|
|
496 ----------
|
|
497 filename : str
|
|
498 Filename of VCF-formatted input
|
|
499 output_prefix : str
|
|
500 Prefix of the VCF output (i.e. without file extension)
|
|
501 keep_original : bool, optional
|
|
502 If the input file should be kept once converted
|
|
503 '''
|
0
|
504
|
|
505 # Holds the arguments to convert to VCF format
|
|
506 convert_args = ['view', '-O', 'v']
|
|
507
|
|
508 # Stores the specified output_prefix to the VCF file
|
|
509 vcf_output = '%s.vcf' % output_prefix
|
|
510
|
|
511 # Assigns the output file to the arguments
|
|
512 convert_args.extend(['-o', vcf_output])
|
|
513
|
|
514 # Assigns the specified input to the arguments
|
|
515 convert_args.append(filename)
|
|
516
|
|
517 # Call bcftools
|
|
518 call_bcftools(convert_args)
|
|
519
|
5
|
520 # Delete the original file once the vcf file is created
|
|
521 if not keep_original:
|
|
522 if check_for_index(filename) == True:
|
|
523 delete_index(filename)
|
|
524 os.remove(filename)
|
|
525
|
|
526 def convert_to_vcfgz (filename, output_prefix, keep_original = False):
|
|
527 '''
|
|
528 Converts a VCF-formatted file to bgzipped-VCF
|
|
529
|
|
530 This function will convert a VCF-formatted file to bgzipped-VCF with the
|
|
531 specified filename prefix. The function also has the option to keep or
|
|
532 delete the input file once the bgzipped-VCF file has been created.
|
|
533
|
|
534 Parameters
|
|
535 ----------
|
|
536 filename : str
|
|
537 Filename of VCF-formatted input
|
|
538 output_prefix : str
|
|
539 Prefix of the bgzipped-VCF output (i.e. without file extension)
|
|
540 keep_original : bool, optional
|
|
541 If the input file should be kept once converted
|
|
542 '''
|
0
|
543
|
|
544 # Holds the arguments to convert to VCFGZ format
|
|
545 convert_args = ['view', '-O', 'z']
|
|
546
|
|
547 # Stores the specified output_prefix to the VCFGZ file
|
|
548 vcfgz_output = '%s.vcf.gz' % output_prefix
|
|
549
|
|
550 # Assigns the output file to the arguments
|
|
551 convert_args.extend(['-o', vcfgz_output])
|
|
552
|
|
553 # Assigns the specified input to the arguments
|
|
554 convert_args.append(filename)
|
|
555
|
|
556 # Call bcftools
|
|
557 call_bcftools(convert_args)
|
5
|
558
|
|
559 # Delete the original file once the vcfgz file is created
|
|
560 if not keep_original:
|
|
561 if check_for_index(filename) == True:
|
|
562 delete_index(filename)
|
|
563 os.remove(filename)
|