comparison vcftools.py @ 3:d1e3db7f6521 draft

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:28:38 -0400
parents
children
comparison
equal deleted inserted replaced
2:54c84f7dcb2c 3:d1e3db7f6521
1 import os
2 import sys
3 import logging
4 import subprocess
5
6 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
7
8 from vcf_reader_func import checkFormat
9 from bcftools import check_bcftools_for_errors
10
11 def check_bgzip_for_errors (bgzip_stderr):
12 '''
13 Checks the bgzip stderr for errors
14
15 Parameters
16 ----------
17 bgzip_stderr : str
18 bgzip stderr
19
20 Raises
21 ------
22 IOError
23 If bgzip stderr returns an error
24 '''
25
26 if bgzip_stderr:
27 raise IOError('Error occured while compressing the vcf file')
28
29 def bgzip_decompress_vcfgz (vcfgz_filename, out_prefix = '', keep_original = False):
30 '''
31 Converts a vcf.gz to vcf
32
33 The function automates bgzip to decompress a vcf.gz file into a vcf
34
35 Parameters
36 ----------
37 vcfgz_filename : str
38 The file name of the vcf.gz file to be decompressed
39 out_prefix : str
40 Output file prefix (i.e. filename without extension)
41 keep_original : bool
42 Specifies if the original file should be kept
43
44 Raises
45 ------
46 IOError
47 Error in creating the compressed file
48 '''
49
50 # Run bgzip with stdout piped to file
51 if keep_original or out_prefix:
52
53 if out_prefix:
54
55 # Assign the bgzip filename
56 vcf_filename = out_prefix + '.vcf'
57
58 else:
59
60 # Seperate into path and filename
61 split_path, split_filename = os.path.split(vcfgz_filename)
62
63 # Remove any file extensions
64 vcf_basename = split_filename.split(os.extsep)[0] + '.vcf'
65
66 # Join path and filename
67 vcf_filename = os.path.join(split_path, vcf_basename)
68
69 # Create the output file
70 vcf_file = open(vcf_filename, 'w')
71
72 # bgzip subprocess call
73 bgzip_call = subprocess.Popen(['bgzip', '-dc', vcfgz_filename], stdout = vcf_file, stderr = subprocess.PIPE)
74
75 # Run bgzip normally
76 else:
77
78 # bgzip subprocess call
79 bgzip_call = subprocess.Popen(['bgzip', '-d', vcfgz_filename], stdout = subprocess.PIPE, stderr = subprocess.PIPE)
80
81 # Save the stdout and stderr from bgzip
82 bgzip_out, bgzip_err = bgzip_call.communicate()
83
84 # Check that output file was compressed correctly
85 check_bgzip_for_errors(bgzip_err)
86
87 # Delete input when also using an output prefix
88 if out_prefix and not keep_original:
89 os.remove(vcfgz_filename)
90
91 def bgzip_compress_vcf (vcf_filename, out_prefix = '', keep_original = False):
92 '''
93 Converts a vcf to vcf.gz
94
95 The function automates bgzip to compress a vcf file into a vcf.gz
96
97 Parameters
98 ----------
99 vcf_filename : str
100 The file name of the vcf file to be compressed
101 keep_original : bool
102 Specifies if the original file should be kept
103
104 Raises
105 ------
106 IOError
107 Error in creating the compressed file
108 '''
109
110 # Compress and keep the original file
111 if keep_original or out_prefix:
112
113 if out_prefix:
114
115 # Assign the filename
116 vcfgz_filename = out_prefix + '.vcf.gz'
117
118 else:
119
120 # Seperate into path and filename
121 split_path, split_filename = os.path.split(vcfgz_filename)
122
123 # Remove any file extensions
124 vcfgz_basename = split_filename.split(os.extsep)[0] + '.vcf.gz'
125
126 # Join path and filename
127 vcfgz_filename = os.path.join(split_path, vcfgz_basename)
128
129
130 # Create the output file
131 vcfgz_file = open(vcfgz_filename, 'w')
132
133 # bgzip subprocess call
134 bgzip_call = subprocess.Popen(['bgzip', '-c', vcf_filename], stdout = vcfgz_file, stderr = subprocess.PIPE)
135
136 # Compress and do not keep the original file
137 else:
138
139 # bgzip subprocess call
140 bgzip_call = subprocess.Popen(['bgzip', vcf_filename], stdout = subprocess.PIPE, stderr = subprocess.PIPE)
141
142 # Save the stdout and stderr from bgzip
143 bgzip_out, bgzip_err = bgzip_call.communicate()
144
145 # Check that output file was compressed correctly
146 check_bgzip_for_errors(bgzip_err)
147
148 def cvt_vcftools_site_to_bed (vcftools_out_str):
149 # Check if str in the header
150 if 'CHROM' not in vcftools_out_str or 'POS' not in vcftools_out_str:
151 # Split the line into a list
152 vcftools_out_data = vcftools_out_str.strip().split('\t')
153 # Convert the chromStart to int
154 vcftools_out_data[1] = int(vcftools_out_data[1])
155 # Calc chromEnd
156 chrom_end = vcftools_out_data[1] + 1
157 # Add chrom_end to the list
158 vcftools_out_data = vcftools_out_data + [chrom_end]
159 # Return the list as a string (with newline element)
160 return '\t'.join(map(str, vcftools_out_data)) + '\n'
161 else:
162 # Remove the header
163 return ''
164
165 def pipe_vcftools (vcftools_call_args):
166 '''
167 Calls vcftools with pipe output
168
169 The output of this function is the stdout and stderr of vcftools. This
170 function should only be used if vcftools is being used as the stdin of
171 another function. Please note that this function does not check the for
172 errors in the vcftools call. Please check for errors after the call is
173 closed using check_vcftools_for_errors.
174
175 Parameters
176 ----------
177 vcftools_call_args : list
178 vcftools arguments
179
180 Returns
181 -------
182 vcftools_call : subprocess.Popen
183 vcftools subprocess call
184 vcftools_call.stdout : PIPE
185 vcftools stdout PIPE (Results)
186 vcftools_call.stderr : PIPE
187 vcftools stderr PIPE (Log)
188
189 '''
190
191 # vcftools subprocess call
192 vcftools_call = subprocess.Popen(['vcftools', '--stdout'] + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
193
194 return vcftools_call
195
196 def pipe_vcftools_to_bed_file (vcftools_call_args, output_filename):
197
198 '''
199 Pipes site-file output of vcftools to a bed formmated file
200
201 The purpose of this function is to avoid creating large uncompressed
202 vcf files by directly piping the output of vcftools to bgzip. This
203 results in creating a vcf.gz file without any intermediates.
204
205 Parameters
206 ----------
207 vcftools_call_args : list
208 vcftools arguments
209 output_filename : str
210 Filename of the bed file
211
212 '''
213 # Open vcftools pipe
214 vcftools_call = pipe_vcftools(vcftools_call_args)
215
216 # Create the bed file
217 bed_output = open(output_filename, 'w')
218
219 try:
220 # Iterate the vcftools stdout unless error occurs
221 for vcftools_stdout_line in iter(vcftools_call.stdout.readline, b''):
222 bed_output.write(cvt_vcftools_site_to_bed(vcftools_stdout_line))
223 # Close the bed file
224 bed_output.close()
225 except:
226 # Close the bed file
227 bed_output.close()
228 # Delete the file
229 os.remove(output_filename)
230
231 # Wait for vctools to finish
232 vcftools_call.wait()
233
234 # Close the vcftools stdout
235 vcftools_call.stdout.close()
236
237 # Read the vcftools stderr
238 vcftools_stderr = vcftools_call.stderr.read()
239
240 # Check if code is running in python 3
241 if sys.version_info[0] == 3:
242 # Convert bytes to string
243 vcftools_stderr = vcftools_stderr.decode()
244
245 # Check that the log file was created correctly
246 check_vcftools_for_errors(vcftools_stderr)
247
248 logging.info('vcftools call complete')
249
250 return vcftools_stderr
251
252 def pipe_vcftools_bgzip (vcftools_call_args, output_filename):
253 '''
254 Pipes the output of vcftools to bgzip
255
256 The purpose of this function is to avoid creating large uncompressed
257 vcf files by directly piping the output of vcftools to bgzip. This
258 results in creating a vcf.gz file without any intermediates.
259
260 Parameters
261 ----------
262 vcftools_call_args : list
263 vcftools arguments
264 output_filename : str
265 Filename of the compressed vcf file
266
267 '''
268
269 vcftools_call = pipe_vcftools(vcftools_call_args)
270
271 # Create bgzip output file
272 bgzip_output = open(output_filename, 'wb')
273
274 # bgzip subprocess call
275 bgzip_call = subprocess.Popen(['bgzip'], stdin = vcftools_call.stdout, stdout = bgzip_output, stderr = subprocess.PIPE)
276
277 # Wait for vctools to finish
278 vcftools_call.wait()
279
280 # Close the vcftools stdout
281 vcftools_call.stdout.close()
282
283 # Read the vcftools stderr
284 vcftools_stderr = vcftools_call.stderr.read()
285
286 # Check if code is running in python 3
287 if sys.version_info[0] == 3:
288 # Convert bytes to string
289 vcftools_stderr = vcftools_stderr.decode()
290
291 # Check that the log file was created correctly
292 check_vcftools_for_errors(vcftools_stderr)
293
294 # Wait for bgzip to finish
295 bgzip_call.wait()
296
297 # Close the compressed vcf file
298 bgzip_output.close()
299
300 # Save the stderr from bgzip, stdout = None
301 bgzip_stdout, bgzip_stderr = bgzip_call.communicate()
302
303 # Check if code is running in python 3
304 if sys.version_info[0] == 3:
305 # Convert bytes to string
306 bgzip_stderr = bgzip_stderr.decode()
307
308 # Check that output file was compressed correctly
309 check_bgzip_for_errors(bgzip_stderr)
310
311 logging.info('vcftools and bgzip calls complete')
312
313 return vcftools_stderr
314
315 def pipe_vcftools_bcftools (vcftools_call_args, output_filename):
316 '''
317 Pipes the output of vcftools to bcftools
318
319 The purpose of this function is to avoid the vcftools command
320 --recode-bcf that may result in malformed BCF files. To avoid large
321 uncompressed intermediates, this function pipes the stdout of vcftools
322 to bcftools.
323
324 Parameters
325 ----------
326 vcftools_call_args : list
327 vcftools arguments
328 output_filename : str
329 Filename of the BCF file
330
331 '''
332
333 vcftools_call = pipe_vcftools(vcftools_call_args)
334
335 # Holds the arguments to convert to BCF format
336 convert_args = ['view', '-O', 'b']
337
338 # Assigns the output file to the arguments
339 convert_args.extend(['-o', output_filename])
340
341 # bcftools subprocess call
342 bcftools_call = subprocess.Popen(['bcftools'] + convert_args, stdin = vcftools_call.stdout, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
343
344 # Wait for vctools to finish
345 vcftools_call.wait()
346
347 # Close the vcftools stdout
348 vcftools_call.stdout.close()
349
350 # Read the vcftools stderr
351 vcftools_stderr = vcftools_call.stderr.read()
352
353 # Check if code is running in python 3
354 if sys.version_info[0] == 3:
355 # Convert bytes to string
356 vcftools_stderr = vcftools_stderr.decode()
357
358 # Check that the log file was created correctly
359 check_vcftools_for_errors(vcftools_stderr)
360
361 # Wait for bgzip to finish
362 bcftools_call.wait()
363
364 # Save the stderr from bgzip, stdout = None
365 bcftools_stdout, bcftools_stderr = bcftools_call.communicate()
366
367 # Check if code is running in python 3
368 if sys.version_info[0] == 3:
369 # Convert bytes to string
370 bcftools_stderr = bcftools_stderr.decode()
371
372 # Check that output file was compressed correctly
373 check_bcftools_for_errors(bcftools_stderr)
374
375 logging.info('vcftools and bcftools calls complete')
376
377 return vcftools_stderr
378
379 def pipe_vcftools_to_file (vcftools_call_args, output_filename, append_output = False):
380 '''
381 Pipes file output of vcftools to a standard file
382
383 The function calls vcftools. Returns the stderr of vcftools to
384 create log file of the call. The function may be used to append multiple
385 calls to vcftools to a single file
386
387 Parameters
388 ----------
389 vcftools_call_args : list
390 vcftools arguments
391 append_output : bool
392 The output file should be written in append mode
393
394 Returns
395 -------
396 vcftools_err : str
397 vcftools log output
398
399 Raises
400 ------
401 Exception
402 If vcftools stderr returns an error
403 '''
404
405 # Open vcftools pipe
406 vcftools_call = pipe_vcftools(vcftools_call_args)
407
408 # Check if the output should be opened in append mode
409 if append_output:
410 # Create the output file (in append mode)
411 output_file = open(output_filename, 'a')
412 else:
413 # Create the output file (in write mode)
414 output_file = open(output_filename, 'w')
415
416
417 try:
418 # Create iterator of the vcftools stdout
419 stdout_iter = iter(vcftools_call.stdout.readline, b'')
420
421 # Check if the output is being appended and the file is empty
422 if append_output and os.stat(output_filename).st_size != 0:
423 # Skip the header if the file isn't empty and appending
424 next(stdout_iter)
425
426 # Iterate the vcftools stdout
427 for vcftools_stdout_line in stdout_iter:
428
429 # Check if code is running in python 3
430 if sys.version_info[0] == 3:
431 # Convert bytes to string
432 vcftools_stdout_line = vcftools_stdout_line.decode()
433
434 output_file.write(vcftools_stdout_line)
435
436 # Close the bed file
437 output_file.close()
438
439 except:
440 # Close the bed file
441 output_file.close()
442 # Delete the file
443 os.remove(output_filename)
444
445 raise Exception('vcftools to python pipe error')
446
447 # Wait for vctools to finish
448 vcftools_call.wait()
449
450 # Close the vcftools stdout
451 vcftools_call.stdout.close()
452
453 # Read the vcftools stderr
454 vcftools_stderr = vcftools_call.stderr.read()
455
456 # Check if code is running in python 3
457 if sys.version_info[0] == 3:
458 # Convert bytes to string
459 vcftools_stderr = vcftools_stderr.decode()
460
461 # Check that the log file was created correctly
462 check_vcftools_for_errors(vcftools_stderr)
463
464 logging.info('vcftools call complete')
465
466 return vcftools_stderr
467
468 def standard_vcftools_call (vcftools_call_args):
469 '''
470 Calls vcftools
471
472 The function calls vcftools. Returns the stderr of vcftools to
473 create log file of the call.
474
475 Parameters
476 ----------
477 vcftools_call_args : list
478 vcftools arguments
479
480 Returns
481 -------
482 vcftools_out : str
483 vcftools call output
484 vcftools_err : str
485 vcftools log output
486
487 Raises
488 ------
489 Exception
490 If vcftools stderr returns an error
491 '''
492
493 # vcftools subprocess call without stdout
494 vcftools_call = subprocess.Popen(['vcftools'] + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
495
496 # Wait for vcftools to finish
497 vcftools_stdout, vcftools_stderr = vcftools_call.communicate()
498
499 # Check if code is running in python 3
500 if sys.version_info[0] == 3:
501 # Convert bytes to string
502 vcftools_stderr = vcftools_stderr.decode()
503
504 logging.info('vcftools call complete')
505
506 # Check that the log file was created correctly
507 check_vcftools_for_errors(vcftools_stderr)
508
509 return vcftools_stderr
510
511 def call_vcftools (vcftools_call_args, output_format = None, output_filename = None):
512 '''
513 Calls vcftools
514
515 The function calls vcftools. Returns the stderr of vcftools to
516 create log file of the call.
517
518 Parameters
519 ----------
520 vcftools_call_args : list
521 vcftools arguments
522 output_format : str
523 The output format
524 output_filename : str
525 The output filename assigned by vcftools (for piped calls)
526
527 Returns
528 -------
529 vcftools_out : str
530 vcftools call output
531 vcftools_err : str
532 vcftools log output
533
534 Raises
535 ------
536 Exception
537 If vcftools stderr returns an error
538 '''
539
540 # Check if the output is a bgzipped vcf
541 if output_format == 'vcf.gz':
542 # Pipe vcftools stdout to bgzip to create a bgzipped vcf
543 vcftools_err = pipe_vcftools_bgzip(vcftools_call_args, output_filename)
544 # Check if the output is a bcf
545 elif output_format == 'bcf':
546 # Pipe vcftools stdout to bgzip to create a bgzipped vcf
547 vcftools_err = pipe_vcftools_bcftools(vcftools_call_args, output_filename)
548 elif output_format == 'removed_bed' or output_format == 'kept_bed':
549 # Pipe vcftools stdout to bed file
550 vcftools_err = pipe_vcftools_to_bed_file(vcftools_call_args, output_filename)
551 elif output_format == 'het-fis':
552 vcftools_err = pipe_vcftools_to_file(vcftools_call_args, output_filename, append_output = True)
553 else:
554 # Call vcftools under standard conditions
555 vcftools_err = standard_vcftools_call(vcftools_call_args)
556
557 # Return the log
558 return vcftools_err
559
560 def check_for_vcftools_output (vcftools_output):
561 '''
562 Checks for the previous vcftools output
563
564 Confirms that neither a previous vcftools log or output file exists.
565
566 Parameters
567 ----------
568 vcftools_output : str
569 Specifies the output filename to be checked
570
571 Raises
572 ------
573 IOError
574 If the vcftools output file exists
575 IOError
576 If the vcftools log file exists
577
578 '''
579 # Check if output file already exists
580 if os.path.isfile(vcftools_output):
581 raise IOError('VCF output file already exists')
582
583 logging.info('Output file assigned')
584
585 # Check if log file already exists
586 if os.path.isfile(vcftools_output + '.log'):
587 raise IOError('Log file already exists')
588
589 logging.info('Log file assigned')
590
591 def delete_vcftools_output (vcftools_output):
592 '''
593 Deletes previous vcftools output
594
595 Confirms if previous vcftools output exists, and if so, deletes it
596
597 Parameters
598 ----------
599 vcftools_output : str
600 Specifies the output filename to be deleted
601
602 Raises
603 ------
604 IOError
605 If the vcftools output cannot be deleted
606 IOError
607 If the vcftools log cannot be deleted
608 '''
609
610 # Check if output file already exists
611 if os.path.isfile(vcftools_output):
612 try:
613 # Delete the output
614 os.remove(vcftools_output)
615 except:
616 raise IOError('VCF output file cannot be deleted')
617
618 logging.info('Output file assigned')
619
620 # Check if log file already exists
621 if os.path.isfile(vcftools_output + '.log'):
622 try:
623 # Delete the output
624 os.remove(vcftools_output + '.log')
625 except:
626 raise IOError('Log file cannot be deleted')
627
628 logging.info('Log file assigned')
629
630 def check_vcftools_for_errors (vcftools_stderr):
631 '''
632 Checks the vcftools stderr for errors
633
634 Parameters
635 ----------
636 vcftools_stderr : str
637 vcftools stderr
638
639 Raises
640 ------
641 IOError
642 If vcftools stderr returns an error
643 '''
644
645 # Returns True if the job completed without error
646 if 'Run Time' in str(vcftools_stderr):
647 pass
648
649 # Print output for vcftools if error is detected
650 elif 'Error' in str(vcftools_stderr):
651 # Splits log into list of lines
652 vcftools_stderr_lines = vcftools_stderr.splitlines()
653 # Prints the error(s)
654 raise Exception('\n'.join((output_line for output_line in vcftools_stderr_lines if output_line.startswith('Error'))))
655
656 # Print output if not completed and no error found. Unlikely to be used, but included.
657 else:
658 raise Exception(vcftools_stderr)
659
660 def produce_vcftools_output (output, filename, append_mode = False, strip_header = False):
661 '''
662 Creates the vcftools output file
663
664 This function will create an output file from the vcftools stdout.
665 Please run `check_vcftools_for_errors` prior to check that vcftools
666 finished without error.
667
668 Parameters
669 ----------
670 output : str
671 vcftools stdout
672 filename : str
673 Specifies the filename for the output file
674 append_mode : bool
675 Used to create a single output file from multiple calls
676 strip_header : bool
677 Used to remove the header if not needed
678
679 Returns
680 -------
681 output : file
682 vcftools output file
683
684 '''
685
686 # Check if the header should be stripped
687 if strip_header:
688 output = ''.join(output.splitlines(True)[1:])
689
690 # Check if single log file is required from multiple calls
691 if append_mode:
692 vcftools_log_file = open(filename,'a')
693 else:
694 vcftools_log_file = open(filename,'w')
695
696 vcftools_log_file.write(str(output))
697 vcftools_log_file.close()
698
699 def produce_vcftools_log (output, filename, append_mode = False):
700 '''
701 Creates the vcftools log file
702
703 This function will create a log file from the vcftools stderr. Please
704 run `check_vcftools_for_errors` prior to check that vcftools finished
705 without error.
706
707 Parameters
708 ----------
709 output : str
710 vcftools stderr
711 filename : str
712 Specifies the filename for the log file
713 append_mode : bool
714 Used to create a single log file from multiple calls
715
716 Returns
717 -------
718 output : file
719 vcftools log file
720
721 '''
722 # Check if single log file is required from multiple calls
723 if append_mode:
724 vcftools_log_file = open(filename + '.log','a')
725 else:
726 vcftools_log_file = open(filename + '.log','w')
727
728 vcftools_log_file.write(str(output))
729 vcftools_log_file.close()
730
731 def assign_vcftools_input_arg (filename):
732 '''
733 Confirms file format for vcftools
734
735 Parameters
736 ----------
737 filename : str
738 Specifies the input filename of unknown format
739
740 Returns
741 -------
742 list
743 Returns vcftools input command for `filename`
744
745 Raises
746 ------
747 IOError
748 If filename is an unknown file format
749 '''
750
751 # True if file extensions is recognized by vcftools
752 if filename.endswith('.vcf') or filename.endswith('.vcf.gz') or filename.endswith('.bcf'):
753 # Assign the associated input command
754 if filename.endswith('.vcf'):
755 return ['--vcf', filename]
756 elif filename.endswith('.vcf.gz'):
757 return ['--gzvcf', filename]
758 elif filename.endswith('.bcf'):
759 return ['--bcf', filename]
760
761 # True if file extension is unknown or not recognized
762 else:
763
764 # Checks if the file is unzipped, bgzipped, or gzipped
765 vcfname_format = checkFormat(filename)
766
767 # Assign the associated input command, or return an error.
768 if vcfname_format == 'vcf':
769 return ['--vcf', filename]
770 elif vcfname_format == 'bgzip':
771 return ['--gzvcf', filename]
772 elif vcfname_format == 'bcf':
773 return ['--bcf', filename]
774 else:
775 raise Exception('Unknown VCF file format')