comparison bcftools.py @ 5:86a9d8d5b291 draft default tip

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:34:34 -0400
parents 3830d29fca6a
children
comparison
equal deleted inserted replaced
4:901857c9b24f 5:86a9d8d5b291
5 5
6 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared'))) 6 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
7 7
8 from vcf_reader_func import checkFormat 8 from vcf_reader_func import checkFormat
9 9
10 def return_output_format_args (output_format):
11 '''
12 Return bcftools arguments for output format
13
14 Parameters
15 ----------
16 output_format : str
17 The specified output format
18
19 Raises
20 ------
21 Exception
22 If output format is unsupported by bcftools
23 '''
24
25 # Return the output format arguments
26 if output_format == 'vcf':
27 return ['-O', 'v']
28 elif output_format == 'bcf':
29 return ['-O', 'b']
30 elif output_format == 'vcf.gz':
31 return ['-O', 'z']
32 else:
33 raise Exception('Unsupported file format')
34
10 def check_bcftools_for_errors (bcftools_stderr): 35 def check_bcftools_for_errors (bcftools_stderr):
11 ''' 36 '''
12 Checks the bgzip stderr for errors 37 Checks the bgzip stderr for errors
13 38
14 Parameters 39 Parameters
16 bcftools_stderr : str 41 bcftools_stderr : str
17 bcftools stderr 42 bcftools stderr
18 43
19 Raises 44 Raises
20 ------ 45 ------
21 IOError 46 Exception
22 If bcftools stderr returns an error 47 If bcftools stderr returns an error
23 ''' 48 '''
24 49
25 # Expand as errors are discovered 50 # Expand as errors are discovered
26 if bcftools_stderr: 51
27 logging.error(vcftools_stderr) 52 # Log warning messages
28 raise Exception(vcftools_stderr) 53 if 'W::' in bcftools_stderr:
29 54 logging.warning(bcftools_stderr.strip())
30 def call_bcftools (bcftools_call_args): 55
56 # Report errors that are not warnings
57 elif bcftools_stderr:
58 raise Exception(bcftools_stderr)
59
60 def pipe_bcftools (bcftools_call_args):
61 '''
62 Calls bcftools with pipe output
63
64 The output of this function is the stdout and stderr of bcftools. This
65 function should only be used if bcftools is being used as the stdin of
66 another function. Please note that this function does not check the for
67 errors in the bcftools call. Please check for errors after the call is
68 closed using check_bcftools_for_errors.
69
70 Parameters
71 ----------
72 bcftools_stderr : str
73 bcftools stderr
74
75 Returns
76 -------
77 bcftools_call : PIPE
78 Pipe of subprocess call, including both stdout and stderr
79
80 '''
31 81
32 # bcftools subprocess call 82 # bcftools subprocess call
33 bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) 83 bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
34 84
85 return bcftools_call
86
87 def pipe_bcftools_to_chr (vcf_filename):
88 '''
89 Pipes chromosome and/or contig output of bcftools to a list of unique
90 entries
91
92 The purpose of this function is to return a list of the unique
93 chromosomes and/or contigs for use in other functions.
94
95 Parameters
96 ----------
97 vcf_filename : str
98 VCF input
99
100 Returns
101 -------
102 chromosomes_to_return : list
103 Unique chromosomes and/or contigs within VCF input
104 '''
105
106 # Open bcftools pipe
107 bcftools_call = pipe_bcftools(['query', '-f', '%CHROM\n', vcf_filename])
108
109 # Create a set to hold unique chromosome
110 chromosomes_to_return = set()
111
112 try:
113
114 # Current chromosomes/contigs, reduces duplicates if VCF is sorted
115 previous_chr = None
116
117 # Iterate the bcftools stdout unless error occurs
118 for bcftools_stdout_line in iter(bcftools_call.stdout.readline, b''):
119 # Remove the newline character
120 bcftools_line_chr = bcftools_stdout_line.strip()
121 # Check if the bcftools bcftools chr is different from stored chr
122 if bcftools_line_chr != previous_chr:
123 # Store the new chr for comparisons to reduce duplicates
124 previous_chr = bcftools_line_chr
125 # Save the chr
126 chromosomes_to_return.add(bcftools_line_chr)
127
128 except:
129 raise Exception('bcftools call error')
130
131 # Close the bcftools stdout
132 bcftools_call.stdout.close()
133
134 # Wait for bctools to finish
135 bcftools_call.wait()
136
137 # Read the bcftools stderr
138 bcftools_stderr = bcftools_call.stderr.read()
139
140 # Check if code is running in python 3
141 if sys.version_info[0] == 3:
142 # Convert bytes to string
143 bcftools_stderr = bcftools_stderr.decode()
144
145 # Check that the log file was created correctly
146 check_bcftools_for_errors(bcftools_stderr)
147
148 logging.info('bcftools call complete')
149
150 return list(chromosomes_to_return)
151
152 def call_bcftools (bcftools_call_args):
153 '''
154 Calls bcftools
155
156 The function calls bcftools.
157
158 Parameters
159 ----------
160 bcftools_call_args : list
161 bcftools arguments
162
163 Returns
164 -------
165 vcftools_err : str
166 vcftools log output
167
168 Raises
169 ------
170 Exception
171 If bcftools stderr returns an error
172 '''
173
174 # bcftools subprocess call
175 bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
176
35 # Wait for bcftools to finish 177 # Wait for bcftools to finish
36 bcftools_out, bcftools_err = bcftools_call.communicate() 178 bcftools_stdout, bcftools_stderr = bcftools_call.communicate()
37 179
38 check_bcftools_for_errors(bcftools_err) 180 # Check if code is running in python 3
181 if sys.version_info[0] == 3:
182 # Convert bytes to string
183 bcftools_stderr = bcftools_stderr.decode()
184
185 check_bcftools_for_errors(bcftools_stderr)
39 186
40 logging.info('bcftools call complete') 187 logging.info('bcftools call complete')
41 188
189 return bcftools_stderr
190
42 def check_for_index (filename): 191 def check_for_index (filename):
192 '''
193 Checks for index file
194
195 If the file is capable of having an index (i.e. bgzipped-VCF or BCF) the
196 function will return either True (i.e. index found) or False. However,
197 if the file is a VCF the function will return None (as VCF files cannot
198 have an index). An error is returned if the file is either a
199 gzipped-VCF file or not a VCF-based format.
200
201 Parameters
202 ----------
203 filename : str
204 Filename of VCF-formatted file
205
206 Returns
207 -------
208 bool, None
209 Returns bool for VCF.GZ and BCF files. Returns None for VCF files
210
211 Raises
212 ------
213 Exception
214 If the file is a gzipped-VCF or of an unknown format
215 '''
43 216
44 # Assign the file format 217 # Assign the file format
45 file_format = checkFormat(filename) 218 file_format = checkFormat(filename)
46 219
47 # Check if the file to be indexed is a vcf.gz 220 # Check if the file to be indexed is a vcf.gz
54 elif file_format == 'bcf': 227 elif file_format == 'bcf':
55 # Check if the index (.csi) exists 228 # Check if the index (.csi) exists
56 if os.path.isfile(filename + '.csi'): 229 if os.path.isfile(filename + '.csi'):
57 return True 230 return True
58 231
232 # Check if the file is vcf (does not need an index)
233 elif file_format == 'vcf':
234 return None
235
236 # Check if the file is gzip-compressed vcf (cannot have an index)
237 elif file_format == 'gzip':
238 raise Exception('GZIP-compressed VCF files do not support index files.')
239
240 # Check if the file is an unknown format
241 else:
242 raise Exception('Unknown file format')
243
59 # Return false if no index is found 244 # Return false if no index is found
60 return False 245 return False
61 246
247 def delete_index (filename):
248 '''
249 Deletes an index file
250
251 If the file is capable of having an index (i.e. bgzipped-VCF or BCF)
252 this function will delete the index. However, if the file is either a
253 VCF or a gzip-compressed VCF the function will return an error. The
254 function also results in an error if the index cannot be found. This
255 function should be used following check_for_index.
256
257 Parameters
258 ----------
259 filename : str
260 Filename of VCF-formatted file
261
262 Raises
263 ------
264 Exception
265 No index file could be found
266 Exception
267 If the file is a gzipped-VCF or a VCF
268 '''
269
270 # Assign the file format
271 file_format = checkFormat(filename)
272
273 # Check if the file to be indexed is a vcf.gz
274 if file_format == 'bgzip':
275 # Check if the index (.tbi) exists
276 if os.path.isfile(filename + '.tbi'):
277 # Delete the index
278 os.remove(filename + '.tbi')
279 return
280
281 # Check if the file to be indexed is a bcf
282 elif file_format == 'bcf':
283 # Check if the index (.csi) exists
284 if os.path.isfile(filename + '.csi'):
285 # Delete the index
286 os.remove(filename + '.csi')
287 return
288
289 # Check if the file is vcf (cannot have an index)
290 elif file_format == 'vcf':
291 raise Exception('VCF format does not support index files.')
292
293 # Check if the file is gzip-compressed vcf (cannot have an index)
294 elif file_format == 'gzip':
295 raise Exception('GZIP-compressed VCF files do not support index files.')
296
297 # Return error if no index is found
298 raise Exception('No index file found.')
299
62 def create_index (filename): 300 def create_index (filename):
301 '''
302 Creates an index file
303
304 If the file is capable of having an index (i.e. bgzipped-VCF or BCF)
305 this function will create an index file. However, if the file is a
306 different format the function will return an error.
307
308 Parameters
309 ----------
310 filename : str
311 Filename of VCF-formatted file
312
313 Raises
314 ------
315 Exception
316 If the file is not a bgzipped-VCF or BCF
317 '''
63 318
64 # Assign the file format 319 # Assign the file format
65 file_format = checkFormat(filename) 320 file_format = checkFormat(filename)
66 321
67 # Check if the file to be indexed is a vcf.gz 322 # Check if the file to be indexed is a vcf.gz
76 331
77 # Report if file cannot be indexed 332 # Report if file cannot be indexed
78 else: 333 else:
79 raise Exception('Error creating index for: %s. Only .bcf and .vcf.gz (bgzip) files are supported.' % filename) 334 raise Exception('Error creating index for: %s. Only .bcf and .vcf.gz (bgzip) files are supported.' % filename)
80 335
81 def convert_to_bcf (filename, output_prefix): 336 def chr_subset_file (filename, chromosome, output_prefix, output_format, from_bp = None, to_bp = None):
337 '''
338 Creates chromosome subset
339
340 This function is used to create a VCF-formatted subset with only
341 the data from a single chromosome.
342
343 Parameters
344 ----------
345 filename : str
346 Filename of VCF-formatted input
347 chromosome : str
348 Chromosome to subset
349 output_prefix : str
350 Prefix of the VCF-formatted output (i.e. without file extension)
351 output_format : str
352 The format of the output (e.g. vcf, bcf, vcf.gz)
353 from_bp : int, optional
354 Lower bound of sites to include
355 to_bp : int, optional
356 Upper bound of sites to include
357 '''
358
359 # Creates a list to the arguments and store the bcftools call
360 subset_args = ['view']
361
362 # Assign the output format arguments
363 output_format_args = return_output_format_args(output_format)
364
365 # Store the output format arguments
366 subset_args.extend(output_format_args)
367
368 # Stores the specified output filename
369 vcf_output = '%s.%s' % (output_prefix, output_format)
370
371 # Assigns the output file to the arguments
372 subset_args.extend(['-o', vcf_output])
373
374 # Holds the subset argument
375 chr_subet_arg = chromosome
376
377 # Check if either bp position arguments were specified
378 if from_bp or to_bp:
379
380 # List of the position arguments, in their required order
381 position_args = [':', from_bp, '-', to_bp]
382
383 # Filter the position arguments to remove empty values
384 filttered_position_args = filter(None, position_args)
385
386 # Map the arguments to str and add them to the chromosome argument
387 chr_subet_arg += ''.join(map(str, filttered_position_args))
388
389 # Checks if the input file has an index, then subset to the arguments
390 if check_for_index(filename):
391 # Subsets using the index
392 subset_args.extend(['-r', chr_subet_arg])
393 else:
394 # Subsets using the stdout
395 subset_args.extend(['-t', chr_subet_arg])
396
397 # Assigns the input file to the arguments
398 subset_args.append(filename)
399
400 # Call bcftools
401 call_bcftools(subset_args)
402
403 def concatenate (filenames, output_prefix, output_format, keep_original = False):
404 '''
405 Concatenate multiple VCF-formatted files
406
407 This function will concatenate multiple VCF-formatted files into a
408 single VCF-formatted file of the specifed format.
409
410 Parameters
411 ----------
412 filenames : list
413 List of VCF-formatted input filenames
414 output_prefix : str
415 Prefix of the VCF-formatted output (i.e. without file extension)
416 output_format : str
417 The format of the output (e.g. vcf, bcf, vcf.gz)
418 '''
419
420 # Holds the arguments to convert to VCF format
421 concat_args = ['concat']
422
423 # Assign the output format arguments
424 output_format_args = return_output_format_args(output_format)
425
426 # Store the output format arguments
427 concat_args.extend(output_format_args)
428
429 # Stores the specified output filename
430 vcf_output = '%s.%s' % (output_prefix, output_format)
431
432 # Assigns the output file to the arguments
433 concat_args.extend(['-o', vcf_output])
434
435 # Assigns the input files to merge
436 concat_args.extend(filenames)
437
438 # Call bcftools
439 call_bcftools(concat_args)
440
441 # Delete the original files once the merged file is created
442 if not keep_original:
443 for filename in filenames:
444 if check_for_index(filename) == True:
445 delete_index(filename)
446 os.remove(filename)
447
448 def convert_to_bcf (filename, output_prefix, keep_original = False):
449 '''
450 Converts a VCF-formatted file to BCF
451
452 This function will convert a VCF-formatted file to BCF with the
453 specified filename prefix. The function also has the option to keep or
454 delete the input file once the BCF file has been created.
455
456 Parameters
457 ----------
458 filename : str
459 Filename of VCF-formatted input
460 output_prefix : str
461 Prefix of the BCF output (i.e. without file extension)
462 keep_original : bool, optional
463 If the input file should be kept once converted
464 '''
82 465
83 # Holds the arguments to convert to BCF format 466 # Holds the arguments to convert to BCF format
84 convert_args = ['convert', '-O', 'b'] 467 convert_args = ['convert', '-O', 'b']
85 468
86 # Stores the specified output_prefix to the BCF file 469 # Stores the specified output_prefix to the BCF file
93 convert_args.append(filename) 476 convert_args.append(filename)
94 477
95 # Call bcftools 478 # Call bcftools
96 call_bcftools(convert_args) 479 call_bcftools(convert_args)
97 480
98 481 # Delete the original file once the bcf file is created
99 def convert_to_vcf (filename, output_prefix): 482 if not keep_original:
483 if check_for_index(filename) == True:
484 delete_index(filename)
485 os.remove(filename)
486
487 def convert_to_vcf (filename, output_prefix, keep_original = False):
488 '''
489 Converts a VCF-formatted file to VCF
490
491 This function will convert a VCF-formatted file to VCF with the
492 specified filename prefix. The function also has the option to keep or
493 delete the input file once the VCF file has been created.
494
495 Parameters
496 ----------
497 filename : str
498 Filename of VCF-formatted input
499 output_prefix : str
500 Prefix of the VCF output (i.e. without file extension)
501 keep_original : bool, optional
502 If the input file should be kept once converted
503 '''
100 504
101 # Holds the arguments to convert to VCF format 505 # Holds the arguments to convert to VCF format
102 convert_args = ['view', '-O', 'v'] 506 convert_args = ['view', '-O', 'v']
103 507
104 # Stores the specified output_prefix to the VCF file 508 # Stores the specified output_prefix to the VCF file
111 convert_args.append(filename) 515 convert_args.append(filename)
112 516
113 # Call bcftools 517 # Call bcftools
114 call_bcftools(convert_args) 518 call_bcftools(convert_args)
115 519
116 def convert_to_vcfgz (filename, output_prefix): 520 # Delete the original file once the vcf file is created
521 if not keep_original:
522 if check_for_index(filename) == True:
523 delete_index(filename)
524 os.remove(filename)
525
526 def convert_to_vcfgz (filename, output_prefix, keep_original = False):
527 '''
528 Converts a VCF-formatted file to bgzipped-VCF
529
530 This function will convert a VCF-formatted file to bgzipped-VCF with the
531 specified filename prefix. The function also has the option to keep or
532 delete the input file once the bgzipped-VCF file has been created.
533
534 Parameters
535 ----------
536 filename : str
537 Filename of VCF-formatted input
538 output_prefix : str
539 Prefix of the bgzipped-VCF output (i.e. without file extension)
540 keep_original : bool, optional
541 If the input file should be kept once converted
542 '''
117 543
118 # Holds the arguments to convert to VCFGZ format 544 # Holds the arguments to convert to VCFGZ format
119 convert_args = ['view', '-O', 'z'] 545 convert_args = ['view', '-O', 'z']
120 546
121 # Stores the specified output_prefix to the VCFGZ file 547 # Stores the specified output_prefix to the VCFGZ file
127 # Assigns the specified input to the arguments 553 # Assigns the specified input to the arguments
128 convert_args.append(filename) 554 convert_args.append(filename)
129 555
130 # Call bcftools 556 # Call bcftools
131 call_bcftools(convert_args) 557 call_bcftools(convert_args)
558
559 # Delete the original file once the vcfgz file is created
560 if not keep_original:
561 if check_for_index(filename) == True:
562 delete_index(filename)
563 os.remove(filename)