Mercurial > repos > jaredgk > ppp_vcfphase
comparison bcftools.py @ 5:86a9d8d5b291 draft default tip
Uploaded
author | jaredgk |
---|---|
date | Wed, 17 Oct 2018 17:34:34 -0400 |
parents | 3830d29fca6a |
children |
comparison
equal
deleted
inserted
replaced
4:901857c9b24f | 5:86a9d8d5b291 |
---|---|
5 | 5 |
6 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared'))) | 6 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared'))) |
7 | 7 |
8 from vcf_reader_func import checkFormat | 8 from vcf_reader_func import checkFormat |
9 | 9 |
10 def return_output_format_args (output_format): | |
11 ''' | |
12 Return bcftools arguments for output format | |
13 | |
14 Parameters | |
15 ---------- | |
16 output_format : str | |
17 The specified output format | |
18 | |
19 Raises | |
20 ------ | |
21 Exception | |
22 If output format is unsupported by bcftools | |
23 ''' | |
24 | |
25 # Return the output format arguments | |
26 if output_format == 'vcf': | |
27 return ['-O', 'v'] | |
28 elif output_format == 'bcf': | |
29 return ['-O', 'b'] | |
30 elif output_format == 'vcf.gz': | |
31 return ['-O', 'z'] | |
32 else: | |
33 raise Exception('Unsupported file format') | |
34 | |
10 def check_bcftools_for_errors (bcftools_stderr): | 35 def check_bcftools_for_errors (bcftools_stderr): |
11 ''' | 36 ''' |
12 Checks the bgzip stderr for errors | 37 Checks the bgzip stderr for errors |
13 | 38 |
14 Parameters | 39 Parameters |
16 bcftools_stderr : str | 41 bcftools_stderr : str |
17 bcftools stderr | 42 bcftools stderr |
18 | 43 |
19 Raises | 44 Raises |
20 ------ | 45 ------ |
21 IOError | 46 Exception |
22 If bcftools stderr returns an error | 47 If bcftools stderr returns an error |
23 ''' | 48 ''' |
24 | 49 |
25 # Expand as errors are discovered | 50 # Expand as errors are discovered |
26 if bcftools_stderr: | 51 |
27 logging.error(vcftools_stderr) | 52 # Log warning messages |
28 raise Exception(vcftools_stderr) | 53 if 'W::' in bcftools_stderr: |
29 | 54 logging.warning(bcftools_stderr.strip()) |
30 def call_bcftools (bcftools_call_args): | 55 |
56 # Report errors that are not warnings | |
57 elif bcftools_stderr: | |
58 raise Exception(bcftools_stderr) | |
59 | |
60 def pipe_bcftools (bcftools_call_args): | |
61 ''' | |
62 Calls bcftools with pipe output | |
63 | |
64 The output of this function is the stdout and stderr of bcftools. This | |
65 function should only be used if bcftools is being used as the stdin of | |
66 another function. Please note that this function does not check the for | |
67 errors in the bcftools call. Please check for errors after the call is | |
68 closed using check_bcftools_for_errors. | |
69 | |
70 Parameters | |
71 ---------- | |
72 bcftools_stderr : str | |
73 bcftools stderr | |
74 | |
75 Returns | |
76 ------- | |
77 bcftools_call : PIPE | |
78 Pipe of subprocess call, including both stdout and stderr | |
79 | |
80 ''' | |
31 | 81 |
32 # bcftools subprocess call | 82 # bcftools subprocess call |
33 bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) | 83 bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
34 | 84 |
85 return bcftools_call | |
86 | |
87 def pipe_bcftools_to_chr (vcf_filename): | |
88 ''' | |
89 Pipes chromosome and/or contig output of bcftools to a list of unique | |
90 entries | |
91 | |
92 The purpose of this function is to return a list of the unique | |
93 chromosomes and/or contigs for use in other functions. | |
94 | |
95 Parameters | |
96 ---------- | |
97 vcf_filename : str | |
98 VCF input | |
99 | |
100 Returns | |
101 ------- | |
102 chromosomes_to_return : list | |
103 Unique chromosomes and/or contigs within VCF input | |
104 ''' | |
105 | |
106 # Open bcftools pipe | |
107 bcftools_call = pipe_bcftools(['query', '-f', '%CHROM\n', vcf_filename]) | |
108 | |
109 # Create a set to hold unique chromosome | |
110 chromosomes_to_return = set() | |
111 | |
112 try: | |
113 | |
114 # Current chromosomes/contigs, reduces duplicates if VCF is sorted | |
115 previous_chr = None | |
116 | |
117 # Iterate the bcftools stdout unless error occurs | |
118 for bcftools_stdout_line in iter(bcftools_call.stdout.readline, b''): | |
119 # Remove the newline character | |
120 bcftools_line_chr = bcftools_stdout_line.strip() | |
121 # Check if the bcftools bcftools chr is different from stored chr | |
122 if bcftools_line_chr != previous_chr: | |
123 # Store the new chr for comparisons to reduce duplicates | |
124 previous_chr = bcftools_line_chr | |
125 # Save the chr | |
126 chromosomes_to_return.add(bcftools_line_chr) | |
127 | |
128 except: | |
129 raise Exception('bcftools call error') | |
130 | |
131 # Close the bcftools stdout | |
132 bcftools_call.stdout.close() | |
133 | |
134 # Wait for bctools to finish | |
135 bcftools_call.wait() | |
136 | |
137 # Read the bcftools stderr | |
138 bcftools_stderr = bcftools_call.stderr.read() | |
139 | |
140 # Check if code is running in python 3 | |
141 if sys.version_info[0] == 3: | |
142 # Convert bytes to string | |
143 bcftools_stderr = bcftools_stderr.decode() | |
144 | |
145 # Check that the log file was created correctly | |
146 check_bcftools_for_errors(bcftools_stderr) | |
147 | |
148 logging.info('bcftools call complete') | |
149 | |
150 return list(chromosomes_to_return) | |
151 | |
152 def call_bcftools (bcftools_call_args): | |
153 ''' | |
154 Calls bcftools | |
155 | |
156 The function calls bcftools. | |
157 | |
158 Parameters | |
159 ---------- | |
160 bcftools_call_args : list | |
161 bcftools arguments | |
162 | |
163 Returns | |
164 ------- | |
165 vcftools_err : str | |
166 vcftools log output | |
167 | |
168 Raises | |
169 ------ | |
170 Exception | |
171 If bcftools stderr returns an error | |
172 ''' | |
173 | |
174 # bcftools subprocess call | |
175 bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
176 | |
35 # Wait for bcftools to finish | 177 # Wait for bcftools to finish |
36 bcftools_out, bcftools_err = bcftools_call.communicate() | 178 bcftools_stdout, bcftools_stderr = bcftools_call.communicate() |
37 | 179 |
38 check_bcftools_for_errors(bcftools_err) | 180 # Check if code is running in python 3 |
181 if sys.version_info[0] == 3: | |
182 # Convert bytes to string | |
183 bcftools_stderr = bcftools_stderr.decode() | |
184 | |
185 check_bcftools_for_errors(bcftools_stderr) | |
39 | 186 |
40 logging.info('bcftools call complete') | 187 logging.info('bcftools call complete') |
41 | 188 |
189 return bcftools_stderr | |
190 | |
42 def check_for_index (filename): | 191 def check_for_index (filename): |
192 ''' | |
193 Checks for index file | |
194 | |
195 If the file is capable of having an index (i.e. bgzipped-VCF or BCF) the | |
196 function will return either True (i.e. index found) or False. However, | |
197 if the file is a VCF the function will return None (as VCF files cannot | |
198 have an index). An error is returned if the file is either a | |
199 gzipped-VCF file or not a VCF-based format. | |
200 | |
201 Parameters | |
202 ---------- | |
203 filename : str | |
204 Filename of VCF-formatted file | |
205 | |
206 Returns | |
207 ------- | |
208 bool, None | |
209 Returns bool for VCF.GZ and BCF files. Returns None for VCF files | |
210 | |
211 Raises | |
212 ------ | |
213 Exception | |
214 If the file is a gzipped-VCF or of an unknown format | |
215 ''' | |
43 | 216 |
44 # Assign the file format | 217 # Assign the file format |
45 file_format = checkFormat(filename) | 218 file_format = checkFormat(filename) |
46 | 219 |
47 # Check if the file to be indexed is a vcf.gz | 220 # Check if the file to be indexed is a vcf.gz |
54 elif file_format == 'bcf': | 227 elif file_format == 'bcf': |
55 # Check if the index (.csi) exists | 228 # Check if the index (.csi) exists |
56 if os.path.isfile(filename + '.csi'): | 229 if os.path.isfile(filename + '.csi'): |
57 return True | 230 return True |
58 | 231 |
232 # Check if the file is vcf (does not need an index) | |
233 elif file_format == 'vcf': | |
234 return None | |
235 | |
236 # Check if the file is gzip-compressed vcf (cannot have an index) | |
237 elif file_format == 'gzip': | |
238 raise Exception('GZIP-compressed VCF files do not support index files.') | |
239 | |
240 # Check if the file is an unknown format | |
241 else: | |
242 raise Exception('Unknown file format') | |
243 | |
59 # Return false if no index is found | 244 # Return false if no index is found |
60 return False | 245 return False |
61 | 246 |
247 def delete_index (filename): | |
248 ''' | |
249 Deletes an index file | |
250 | |
251 If the file is capable of having an index (i.e. bgzipped-VCF or BCF) | |
252 this function will delete the index. However, if the file is either a | |
253 VCF or a gzip-compressed VCF the function will return an error. The | |
254 function also results in an error if the index cannot be found. This | |
255 function should be used following check_for_index. | |
256 | |
257 Parameters | |
258 ---------- | |
259 filename : str | |
260 Filename of VCF-formatted file | |
261 | |
262 Raises | |
263 ------ | |
264 Exception | |
265 No index file could be found | |
266 Exception | |
267 If the file is a gzipped-VCF or a VCF | |
268 ''' | |
269 | |
270 # Assign the file format | |
271 file_format = checkFormat(filename) | |
272 | |
273 # Check if the file to be indexed is a vcf.gz | |
274 if file_format == 'bgzip': | |
275 # Check if the index (.tbi) exists | |
276 if os.path.isfile(filename + '.tbi'): | |
277 # Delete the index | |
278 os.remove(filename + '.tbi') | |
279 return | |
280 | |
281 # Check if the file to be indexed is a bcf | |
282 elif file_format == 'bcf': | |
283 # Check if the index (.csi) exists | |
284 if os.path.isfile(filename + '.csi'): | |
285 # Delete the index | |
286 os.remove(filename + '.csi') | |
287 return | |
288 | |
289 # Check if the file is vcf (cannot have an index) | |
290 elif file_format == 'vcf': | |
291 raise Exception('VCF format does not support index files.') | |
292 | |
293 # Check if the file is gzip-compressed vcf (cannot have an index) | |
294 elif file_format == 'gzip': | |
295 raise Exception('GZIP-compressed VCF files do not support index files.') | |
296 | |
297 # Return error if no index is found | |
298 raise Exception('No index file found.') | |
299 | |
62 def create_index (filename): | 300 def create_index (filename): |
301 ''' | |
302 Creates an index file | |
303 | |
304 If the file is capable of having an index (i.e. bgzipped-VCF or BCF) | |
305 this function will create an index file. However, if the file is a | |
306 different format the function will return an error. | |
307 | |
308 Parameters | |
309 ---------- | |
310 filename : str | |
311 Filename of VCF-formatted file | |
312 | |
313 Raises | |
314 ------ | |
315 Exception | |
316 If the file is not a bgzipped-VCF or BCF | |
317 ''' | |
63 | 318 |
64 # Assign the file format | 319 # Assign the file format |
65 file_format = checkFormat(filename) | 320 file_format = checkFormat(filename) |
66 | 321 |
67 # Check if the file to be indexed is a vcf.gz | 322 # Check if the file to be indexed is a vcf.gz |
76 | 331 |
77 # Report if file cannot be indexed | 332 # Report if file cannot be indexed |
78 else: | 333 else: |
79 raise Exception('Error creating index for: %s. Only .bcf and .vcf.gz (bgzip) files are supported.' % filename) | 334 raise Exception('Error creating index for: %s. Only .bcf and .vcf.gz (bgzip) files are supported.' % filename) |
80 | 335 |
81 def convert_to_bcf (filename, output_prefix): | 336 def chr_subset_file (filename, chromosome, output_prefix, output_format, from_bp = None, to_bp = None): |
337 ''' | |
338 Creates chromosome subset | |
339 | |
340 This function is used to create a VCF-formatted subset with only | |
341 the data from a single chromosome. | |
342 | |
343 Parameters | |
344 ---------- | |
345 filename : str | |
346 Filename of VCF-formatted input | |
347 chromosome : str | |
348 Chromosome to subset | |
349 output_prefix : str | |
350 Prefix of the VCF-formatted output (i.e. without file extension) | |
351 output_format : str | |
352 The format of the output (e.g. vcf, bcf, vcf.gz) | |
353 from_bp : int, optional | |
354 Lower bound of sites to include | |
355 to_bp : int, optional | |
356 Upper bound of sites to include | |
357 ''' | |
358 | |
359 # Creates a list to the arguments and store the bcftools call | |
360 subset_args = ['view'] | |
361 | |
362 # Assign the output format arguments | |
363 output_format_args = return_output_format_args(output_format) | |
364 | |
365 # Store the output format arguments | |
366 subset_args.extend(output_format_args) | |
367 | |
368 # Stores the specified output filename | |
369 vcf_output = '%s.%s' % (output_prefix, output_format) | |
370 | |
371 # Assigns the output file to the arguments | |
372 subset_args.extend(['-o', vcf_output]) | |
373 | |
374 # Holds the subset argument | |
375 chr_subet_arg = chromosome | |
376 | |
377 # Check if either bp position arguments were specified | |
378 if from_bp or to_bp: | |
379 | |
380 # List of the position arguments, in their required order | |
381 position_args = [':', from_bp, '-', to_bp] | |
382 | |
383 # Filter the position arguments to remove empty values | |
384 filttered_position_args = filter(None, position_args) | |
385 | |
386 # Map the arguments to str and add them to the chromosome argument | |
387 chr_subet_arg += ''.join(map(str, filttered_position_args)) | |
388 | |
389 # Checks if the input file has an index, then subset to the arguments | |
390 if check_for_index(filename): | |
391 # Subsets using the index | |
392 subset_args.extend(['-r', chr_subet_arg]) | |
393 else: | |
394 # Subsets using the stdout | |
395 subset_args.extend(['-t', chr_subet_arg]) | |
396 | |
397 # Assigns the input file to the arguments | |
398 subset_args.append(filename) | |
399 | |
400 # Call bcftools | |
401 call_bcftools(subset_args) | |
402 | |
403 def concatenate (filenames, output_prefix, output_format, keep_original = False): | |
404 ''' | |
405 Concatenate multiple VCF-formatted files | |
406 | |
407 This function will concatenate multiple VCF-formatted files into a | |
408 single VCF-formatted file of the specifed format. | |
409 | |
410 Parameters | |
411 ---------- | |
412 filenames : list | |
413 List of VCF-formatted input filenames | |
414 output_prefix : str | |
415 Prefix of the VCF-formatted output (i.e. without file extension) | |
416 output_format : str | |
417 The format of the output (e.g. vcf, bcf, vcf.gz) | |
418 ''' | |
419 | |
420 # Holds the arguments to convert to VCF format | |
421 concat_args = ['concat'] | |
422 | |
423 # Assign the output format arguments | |
424 output_format_args = return_output_format_args(output_format) | |
425 | |
426 # Store the output format arguments | |
427 concat_args.extend(output_format_args) | |
428 | |
429 # Stores the specified output filename | |
430 vcf_output = '%s.%s' % (output_prefix, output_format) | |
431 | |
432 # Assigns the output file to the arguments | |
433 concat_args.extend(['-o', vcf_output]) | |
434 | |
435 # Assigns the input files to merge | |
436 concat_args.extend(filenames) | |
437 | |
438 # Call bcftools | |
439 call_bcftools(concat_args) | |
440 | |
441 # Delete the original files once the merged file is created | |
442 if not keep_original: | |
443 for filename in filenames: | |
444 if check_for_index(filename) == True: | |
445 delete_index(filename) | |
446 os.remove(filename) | |
447 | |
448 def convert_to_bcf (filename, output_prefix, keep_original = False): | |
449 ''' | |
450 Converts a VCF-formatted file to BCF | |
451 | |
452 This function will convert a VCF-formatted file to BCF with the | |
453 specified filename prefix. The function also has the option to keep or | |
454 delete the input file once the BCF file has been created. | |
455 | |
456 Parameters | |
457 ---------- | |
458 filename : str | |
459 Filename of VCF-formatted input | |
460 output_prefix : str | |
461 Prefix of the BCF output (i.e. without file extension) | |
462 keep_original : bool, optional | |
463 If the input file should be kept once converted | |
464 ''' | |
82 | 465 |
83 # Holds the arguments to convert to BCF format | 466 # Holds the arguments to convert to BCF format |
84 convert_args = ['convert', '-O', 'b'] | 467 convert_args = ['convert', '-O', 'b'] |
85 | 468 |
86 # Stores the specified output_prefix to the BCF file | 469 # Stores the specified output_prefix to the BCF file |
93 convert_args.append(filename) | 476 convert_args.append(filename) |
94 | 477 |
95 # Call bcftools | 478 # Call bcftools |
96 call_bcftools(convert_args) | 479 call_bcftools(convert_args) |
97 | 480 |
98 | 481 # Delete the original file once the bcf file is created |
99 def convert_to_vcf (filename, output_prefix): | 482 if not keep_original: |
483 if check_for_index(filename) == True: | |
484 delete_index(filename) | |
485 os.remove(filename) | |
486 | |
487 def convert_to_vcf (filename, output_prefix, keep_original = False): | |
488 ''' | |
489 Converts a VCF-formatted file to VCF | |
490 | |
491 This function will convert a VCF-formatted file to VCF with the | |
492 specified filename prefix. The function also has the option to keep or | |
493 delete the input file once the VCF file has been created. | |
494 | |
495 Parameters | |
496 ---------- | |
497 filename : str | |
498 Filename of VCF-formatted input | |
499 output_prefix : str | |
500 Prefix of the VCF output (i.e. without file extension) | |
501 keep_original : bool, optional | |
502 If the input file should be kept once converted | |
503 ''' | |
100 | 504 |
101 # Holds the arguments to convert to VCF format | 505 # Holds the arguments to convert to VCF format |
102 convert_args = ['view', '-O', 'v'] | 506 convert_args = ['view', '-O', 'v'] |
103 | 507 |
104 # Stores the specified output_prefix to the VCF file | 508 # Stores the specified output_prefix to the VCF file |
111 convert_args.append(filename) | 515 convert_args.append(filename) |
112 | 516 |
113 # Call bcftools | 517 # Call bcftools |
114 call_bcftools(convert_args) | 518 call_bcftools(convert_args) |
115 | 519 |
116 def convert_to_vcfgz (filename, output_prefix): | 520 # Delete the original file once the vcf file is created |
521 if not keep_original: | |
522 if check_for_index(filename) == True: | |
523 delete_index(filename) | |
524 os.remove(filename) | |
525 | |
526 def convert_to_vcfgz (filename, output_prefix, keep_original = False): | |
527 ''' | |
528 Converts a VCF-formatted file to bgzipped-VCF | |
529 | |
530 This function will convert a VCF-formatted file to bgzipped-VCF with the | |
531 specified filename prefix. The function also has the option to keep or | |
532 delete the input file once the bgzipped-VCF file has been created. | |
533 | |
534 Parameters | |
535 ---------- | |
536 filename : str | |
537 Filename of VCF-formatted input | |
538 output_prefix : str | |
539 Prefix of the bgzipped-VCF output (i.e. without file extension) | |
540 keep_original : bool, optional | |
541 If the input file should be kept once converted | |
542 ''' | |
117 | 543 |
118 # Holds the arguments to convert to VCFGZ format | 544 # Holds the arguments to convert to VCFGZ format |
119 convert_args = ['view', '-O', 'z'] | 545 convert_args = ['view', '-O', 'z'] |
120 | 546 |
121 # Stores the specified output_prefix to the VCFGZ file | 547 # Stores the specified output_prefix to the VCFGZ file |
127 # Assigns the specified input to the arguments | 553 # Assigns the specified input to the arguments |
128 convert_args.append(filename) | 554 convert_args.append(filename) |
129 | 555 |
130 # Call bcftools | 556 # Call bcftools |
131 call_bcftools(convert_args) | 557 call_bcftools(convert_args) |
558 | |
559 # Delete the original file once the vcfgz file is created | |
560 if not keep_original: | |
561 if check_for_index(filename) == True: | |
562 delete_index(filename) | |
563 os.remove(filename) |