ppp_vcfphase: vcftools.py comparison

comparison vcftools.py @ 3:d1e3db7f6521 draft

Uploaded

author	jaredgk
date	Wed, 17 Oct 2018 17:28:38 -0400
parents
children

comparison

equal deleted inserted replaced

-:54c84f7dcb2c
+:d1e3db7f6521
+import os
+import sys
+import logging
+import subprocess
+sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
+from vcf_reader_func import checkFormat
+from bcftools import check_bcftools_for_errors
+def check_bgzip_for_errors (bgzip_stderr):
+'''
+Checks the bgzip stderr for errors
+Parameters
+----------
+bgzip_stderr : str
+bgzip stderr
+Raises
+------
+IOError
+If bgzip stderr returns an error
+'''
+if bgzip_stderr:
+raise IOError('Error occured while compressing the vcf file')
+def bgzip_decompress_vcfgz (vcfgz_filename, out_prefix = '', keep_original = False):
+'''
+Converts a vcf.gz to vcf
+The function automates bgzip to decompress a vcf.gz file into a vcf
+Parameters
+----------
+vcfgz_filename : str
+The file name of the vcf.gz file to be decompressed
+out_prefix : str
+Output file prefix (i.e. filename without extension)
+keep_original : bool
+Specifies if the original file should be kept
+Raises
+------
+IOError
+Error in creating the compressed file
+'''
+# Run bgzip with stdout piped to file
+if keep_original or out_prefix:
+if out_prefix:
+# Assign the bgzip filename
+vcf_filename = out_prefix + '.vcf'
+else:
+# Seperate into path and filename
+split_path, split_filename = os.path.split(vcfgz_filename)
+# Remove any file extensions
+vcf_basename = split_filename.split(os.extsep)[0] + '.vcf'
+# Join path and filename
+vcf_filename = os.path.join(split_path, vcf_basename)
+# Create the output file
+vcf_file = open(vcf_filename, 'w')
+# bgzip subprocess call
+bgzip_call = subprocess.Popen(['bgzip', '-dc', vcfgz_filename], stdout = vcf_file, stderr = subprocess.PIPE)
+# Run bgzip normally
+else:
+# bgzip subprocess call
+bgzip_call = subprocess.Popen(['bgzip', '-d', vcfgz_filename], stdout = subprocess.PIPE, stderr = subprocess.PIPE)
+# Save the stdout and stderr from bgzip
+bgzip_out, bgzip_err = bgzip_call.communicate()
+# Check that output file was compressed correctly
+check_bgzip_for_errors(bgzip_err)
+# Delete input when also using an output prefix
+if out_prefix and not keep_original:
+os.remove(vcfgz_filename)
+def bgzip_compress_vcf (vcf_filename, out_prefix = '', keep_original = False):
+'''
+Converts a vcf to vcf.gz
+The function automates bgzip to compress a vcf file into a vcf.gz
+Parameters
+----------
+vcf_filename : str
+The file name of the vcf file to be compressed
+keep_original : bool
+Specifies if the original file should be kept
+Raises
+------
+IOError
+Error in creating the compressed file
+'''
+# Compress and keep the original file
+if keep_original or out_prefix:
+if out_prefix:
+# Assign the filename
+vcfgz_filename = out_prefix + '.vcf.gz'
+else:
+# Seperate into path and filename
+split_path, split_filename = os.path.split(vcfgz_filename)
+# Remove any file extensions
+vcfgz_basename = split_filename.split(os.extsep)[0] + '.vcf.gz'
+# Join path and filename
+vcfgz_filename = os.path.join(split_path, vcfgz_basename)
+# Create the output file
+vcfgz_file = open(vcfgz_filename, 'w')
+# bgzip subprocess call
+bgzip_call = subprocess.Popen(['bgzip', '-c', vcf_filename], stdout = vcfgz_file, stderr = subprocess.PIPE)
+# Compress and do not keep the original file
+else:
+# bgzip subprocess call
+bgzip_call = subprocess.Popen(['bgzip', vcf_filename], stdout = subprocess.PIPE, stderr = subprocess.PIPE)
+# Save the stdout and stderr from bgzip
+bgzip_out, bgzip_err = bgzip_call.communicate()
+# Check that output file was compressed correctly
+check_bgzip_for_errors(bgzip_err)
+def cvt_vcftools_site_to_bed (vcftools_out_str):
+# Check if str in the header
+if 'CHROM' not in vcftools_out_str or 'POS' not in vcftools_out_str:
+# Split the line into a list
+vcftools_out_data = vcftools_out_str.strip().split('\t')
+# Convert the chromStart to int
+vcftools_out_data[1] = int(vcftools_out_data[1])
+# Calc chromEnd
+chrom_end = vcftools_out_data[1] + 1
+# Add chrom_end to the list
+vcftools_out_data = vcftools_out_data + [chrom_end]
+# Return the list as a string (with newline element)
+return '\t'.join(map(str, vcftools_out_data)) + '\n'
+else:
+# Remove the header
+return ''
+def pipe_vcftools (vcftools_call_args):
+'''
+Calls vcftools with pipe output
+The output of this function is the stdout and stderr of vcftools. This
+function should only be used if vcftools is being used as the stdin of
+another function. Please note that this function does not check the for
+errors in the vcftools call. Please check for errors after the call is
+closed using check_vcftools_for_errors.
+Parameters
+----------
+vcftools_call_args : list
+vcftools arguments
+Returns
+-------
+vcftools_call : subprocess.Popen
+vcftools subprocess call
+vcftools_call.stdout : PIPE
+vcftools stdout PIPE (Results)
+vcftools_call.stderr : PIPE
+vcftools stderr PIPE (Log)
+'''
+# vcftools subprocess call
+vcftools_call = subprocess.Popen(['vcftools', '--stdout'] + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+return vcftools_call
+def pipe_vcftools_to_bed_file (vcftools_call_args, output_filename):
+'''
+Pipes site-file output of vcftools to a bed formmated file
+The purpose of this function is to avoid creating large uncompressed
+vcf files by directly piping the output of vcftools to bgzip. This
+results in creating a vcf.gz file without any intermediates.
+Parameters
+----------
+vcftools_call_args : list
+vcftools arguments
+output_filename : str
+Filename of the bed file
+'''
+# Open vcftools pipe
+vcftools_call = pipe_vcftools(vcftools_call_args)
+# Create the bed file
+bed_output = open(output_filename, 'w')
+try:
+# Iterate the vcftools stdout unless error occurs
+for vcftools_stdout_line in iter(vcftools_call.stdout.readline, b''):
+bed_output.write(cvt_vcftools_site_to_bed(vcftools_stdout_line))
+# Close the bed file
+bed_output.close()
+except:
+# Close the bed file
+bed_output.close()
+# Delete the file
+os.remove(output_filename)
+# Wait for vctools to finish
+vcftools_call.wait()
+# Close the vcftools stdout
+vcftools_call.stdout.close()
+# Read the vcftools stderr
+vcftools_stderr = vcftools_call.stderr.read()
+# Check if code is running in python 3
+if sys.version_info[0] == 3:
+# Convert bytes to string
+vcftools_stderr = vcftools_stderr.decode()
+# Check that the log file was created correctly
+check_vcftools_for_errors(vcftools_stderr)
+logging.info('vcftools call complete')
+return vcftools_stderr
+def pipe_vcftools_bgzip (vcftools_call_args, output_filename):
+'''
+Pipes the output of vcftools to bgzip
+The purpose of this function is to avoid creating large uncompressed
+vcf files by directly piping the output of vcftools to bgzip. This
+results in creating a vcf.gz file without any intermediates.
+Parameters
+----------
+vcftools_call_args : list
+vcftools arguments
+output_filename : str
+Filename of the compressed vcf file
+'''
+vcftools_call = pipe_vcftools(vcftools_call_args)
+# Create bgzip output file
+bgzip_output = open(output_filename, 'wb')
+# bgzip subprocess call
+bgzip_call = subprocess.Popen(['bgzip'], stdin = vcftools_call.stdout, stdout = bgzip_output, stderr = subprocess.PIPE)
+# Wait for vctools to finish
+vcftools_call.wait()
+# Close the vcftools stdout
+vcftools_call.stdout.close()
+# Read the vcftools stderr
+vcftools_stderr = vcftools_call.stderr.read()
+# Check if code is running in python 3
+if sys.version_info[0] == 3:
+# Convert bytes to string
+vcftools_stderr = vcftools_stderr.decode()
+# Check that the log file was created correctly
+check_vcftools_for_errors(vcftools_stderr)
+# Wait for bgzip to finish
+bgzip_call.wait()
+# Close the compressed vcf file
+bgzip_output.close()
+# Save the stderr from bgzip, stdout = None
+bgzip_stdout, bgzip_stderr = bgzip_call.communicate()
+# Check if code is running in python 3
+if sys.version_info[0] == 3:
+# Convert bytes to string
+bgzip_stderr = bgzip_stderr.decode()
+# Check that output file was compressed correctly
+check_bgzip_for_errors(bgzip_stderr)
+logging.info('vcftools and bgzip calls complete')
+return vcftools_stderr
+def pipe_vcftools_bcftools (vcftools_call_args, output_filename):
+'''
+Pipes the output of vcftools to bcftools
+The purpose of this function is to avoid the vcftools command
+--recode-bcf that may result in malformed BCF files. To avoid large
+uncompressed intermediates, this function pipes the stdout of vcftools
+to bcftools.
+Parameters
+----------
+vcftools_call_args : list
+vcftools arguments
+output_filename : str
+Filename of the BCF file
+'''
+vcftools_call = pipe_vcftools(vcftools_call_args)
+# Holds the arguments to convert to BCF format
+convert_args = ['view', '-O', 'b']
+# Assigns the output file to the arguments
+convert_args.extend(['-o', output_filename])
+# bcftools subprocess call
+bcftools_call = subprocess.Popen(['bcftools'] + convert_args, stdin = vcftools_call.stdout, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
+# Wait for vctools to finish
+vcftools_call.wait()
+# Close the vcftools stdout
+vcftools_call.stdout.close()
+# Read the vcftools stderr
+vcftools_stderr = vcftools_call.stderr.read()
+# Check if code is running in python 3
+if sys.version_info[0] == 3:
+# Convert bytes to string
+vcftools_stderr = vcftools_stderr.decode()
+# Check that the log file was created correctly
+check_vcftools_for_errors(vcftools_stderr)
+# Wait for bgzip to finish
+bcftools_call.wait()
+# Save the stderr from bgzip, stdout = None
+bcftools_stdout, bcftools_stderr = bcftools_call.communicate()
+# Check if code is running in python 3
+if sys.version_info[0] == 3:
+# Convert bytes to string
+bcftools_stderr = bcftools_stderr.decode()
+# Check that output file was compressed correctly
+check_bcftools_for_errors(bcftools_stderr)
+logging.info('vcftools and bcftools calls complete')
+return vcftools_stderr
+def pipe_vcftools_to_file (vcftools_call_args, output_filename, append_output = False):
+'''
+Pipes file output of vcftools to a standard file
+The function calls vcftools. Returns the stderr of vcftools to
+create log file of the call. The function may be used to append multiple
+calls to vcftools to a single file
+Parameters
+----------
+vcftools_call_args : list
+vcftools arguments
+append_output : bool
+The output file should be written in append mode
+Returns
+-------
+vcftools_err : str
+vcftools log output
+Raises
+------
+Exception
+If vcftools stderr returns an error
+'''
+# Open vcftools pipe
+vcftools_call = pipe_vcftools(vcftools_call_args)
+# Check if the output should be opened in append mode
+if append_output:
+# Create the output file (in append mode)
+output_file = open(output_filename, 'a')
+else:
+# Create the output file (in write mode)
+output_file = open(output_filename, 'w')
+try:
+# Create iterator of the vcftools stdout
+stdout_iter = iter(vcftools_call.stdout.readline, b'')
+# Check if the output is being appended and the file is empty
+if append_output and os.stat(output_filename).st_size != 0:
+# Skip the header if the file isn't empty and appending
+next(stdout_iter)
+# Iterate the vcftools stdout
+for vcftools_stdout_line in stdout_iter:
+# Check if code is running in python 3
+if sys.version_info[0] == 3:
+# Convert bytes to string
+vcftools_stdout_line = vcftools_stdout_line.decode()
+output_file.write(vcftools_stdout_line)
+# Close the bed file
+output_file.close()
+except:
+# Close the bed file
+output_file.close()
+# Delete the file
+os.remove(output_filename)
+raise Exception('vcftools to python pipe error')
+# Wait for vctools to finish
+vcftools_call.wait()
+# Close the vcftools stdout
+vcftools_call.stdout.close()
+# Read the vcftools stderr
+vcftools_stderr = vcftools_call.stderr.read()
+# Check if code is running in python 3
+if sys.version_info[0] == 3:
+# Convert bytes to string
+vcftools_stderr = vcftools_stderr.decode()
+# Check that the log file was created correctly
+check_vcftools_for_errors(vcftools_stderr)
+logging.info('vcftools call complete')
+return vcftools_stderr
+def standard_vcftools_call (vcftools_call_args):
+'''
+Calls vcftools
+The function calls vcftools. Returns the stderr of vcftools to
+create log file of the call.
+Parameters
+----------
+vcftools_call_args : list
+vcftools arguments
+Returns
+-------
+vcftools_out : str
+vcftools call output
+vcftools_err : str
+vcftools log output
+Raises
+------
+Exception
+If vcftools stderr returns an error
+'''
+# vcftools subprocess call without stdout
+vcftools_call = subprocess.Popen(['vcftools'] + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+# Wait for vcftools to finish
+vcftools_stdout, vcftools_stderr = vcftools_call.communicate()
+# Check if code is running in python 3
+if sys.version_info[0] == 3:
+# Convert bytes to string
+vcftools_stderr = vcftools_stderr.decode()
+logging.info('vcftools call complete')
+# Check that the log file was created correctly
+check_vcftools_for_errors(vcftools_stderr)
+return vcftools_stderr
+def call_vcftools (vcftools_call_args, output_format = None, output_filename = None):
+'''
+Calls vcftools
+The function calls vcftools. Returns the stderr of vcftools to
+create log file of the call.
+Parameters
+----------
+vcftools_call_args : list
+vcftools arguments
+output_format : str
+The output format
+output_filename : str
+The output filename assigned by vcftools (for piped calls)
+Returns
+-------
+vcftools_out : str
+vcftools call output
+vcftools_err : str
+vcftools log output
+Raises
+------
+Exception
+If vcftools stderr returns an error
+'''
+# Check if the output is a bgzipped vcf
+if output_format == 'vcf.gz':
+# Pipe vcftools stdout to bgzip to create a bgzipped vcf
+vcftools_err = pipe_vcftools_bgzip(vcftools_call_args, output_filename)
+# Check if the output is a bcf
+elif output_format == 'bcf':
+# Pipe vcftools stdout to bgzip to create a bgzipped vcf
+vcftools_err = pipe_vcftools_bcftools(vcftools_call_args, output_filename)
+elif output_format == 'removed_bed' or output_format == 'kept_bed':
+# Pipe vcftools stdout to bed file
+vcftools_err = pipe_vcftools_to_bed_file(vcftools_call_args, output_filename)
+elif output_format == 'het-fis':
+vcftools_err = pipe_vcftools_to_file(vcftools_call_args, output_filename, append_output = True)
+else:
+# Call vcftools under standard conditions
+vcftools_err = standard_vcftools_call(vcftools_call_args)
+# Return the log
+return vcftools_err
+def check_for_vcftools_output (vcftools_output):
+'''
+Checks for the previous vcftools output
+Confirms that neither a previous vcftools log or output file exists.
+Parameters
+----------
+vcftools_output : str
+Specifies the output filename to be checked
+Raises
+------
+IOError
+If the vcftools output file exists
+IOError
+If the vcftools log file exists
+'''
+# Check if output file already exists
+if os.path.isfile(vcftools_output):
+raise IOError('VCF output file already exists')
+logging.info('Output file assigned')
+# Check if log file already exists
+if os.path.isfile(vcftools_output + '.log'):
+raise IOError('Log file already exists')
+logging.info('Log file assigned')
+def delete_vcftools_output (vcftools_output):
+'''
+Deletes previous vcftools output
+Confirms if previous vcftools output exists, and if so, deletes it
+Parameters
+----------
+vcftools_output : str
+Specifies the output filename to be deleted
+Raises
+------
+IOError
+If the vcftools output cannot be deleted
+IOError
+If the vcftools log cannot be deleted
+'''
+# Check if output file already exists
+if os.path.isfile(vcftools_output):
+try:
+# Delete the output
+os.remove(vcftools_output)
+except:
+raise IOError('VCF output file cannot be deleted')
+logging.info('Output file assigned')
+# Check if log file already exists
+if os.path.isfile(vcftools_output + '.log'):
+try:
+# Delete the output
+os.remove(vcftools_output + '.log')
+except:
+raise IOError('Log file cannot be deleted')
+logging.info('Log file assigned')
+def check_vcftools_for_errors (vcftools_stderr):
+'''
+Checks the vcftools stderr for errors
+Parameters
+----------
+vcftools_stderr : str
+vcftools stderr
+Raises
+------
+IOError
+If vcftools stderr returns an error
+'''
+# Returns True if the job completed without error
+if 'Run Time' in str(vcftools_stderr):
+pass
+# Print output for vcftools if error is detected
+elif 'Error' in str(vcftools_stderr):
+# Splits log into list of lines
+vcftools_stderr_lines = vcftools_stderr.splitlines()
+# Prints the error(s)
+raise Exception('\n'.join((output_line for output_line in vcftools_stderr_lines if output_line.startswith('Error'))))
+# Print output if not completed and no error found. Unlikely to be used, but included.
+else:
+raise Exception(vcftools_stderr)
+def produce_vcftools_output (output, filename, append_mode = False, strip_header = False):
+'''
+Creates the vcftools output file
+This function will create an output file from the vcftools stdout.
+Please run `check_vcftools_for_errors` prior to check that vcftools
+finished without error.
+Parameters
+----------
+output : str
+vcftools stdout
+filename : str
+Specifies the filename for the output file
+append_mode : bool
+Used to create a single output file from multiple calls
+strip_header : bool
+Used to remove the header if not needed
+Returns
+-------
+output : file
+vcftools output file
+'''
+# Check if the header should be stripped
+if strip_header:
+output = ''.join(output.splitlines(True)[1:])
+# Check if single log file is required from multiple calls
+if append_mode:
+vcftools_log_file = open(filename,'a')
+else:
+vcftools_log_file = open(filename,'w')
+vcftools_log_file.write(str(output))
+vcftools_log_file.close()
+def produce_vcftools_log (output, filename, append_mode = False):
+'''
+Creates the vcftools log file
+This function will create a log file from the vcftools stderr. Please
+run `check_vcftools_for_errors` prior to check that vcftools finished
+without error.
+Parameters
+----------
+output : str
+vcftools stderr
+filename : str
+Specifies the filename for the log file
+append_mode : bool
+Used to create a single log file from multiple calls
+Returns
+-------
+output : file
+vcftools log file
+'''
+# Check if single log file is required from multiple calls
+if append_mode:
+vcftools_log_file = open(filename + '.log','a')
+else:
+vcftools_log_file = open(filename + '.log','w')
+vcftools_log_file.write(str(output))
+vcftools_log_file.close()
+def assign_vcftools_input_arg (filename):
+'''
+Confirms file format for vcftools
+Parameters
+----------
+filename : str
+Specifies the input filename of unknown format
+Returns
+-------
+list
+Returns vcftools input command for `filename`
+Raises
+------
+IOError
+If filename is an unknown file format
+'''
+# True if file extensions is recognized by vcftools
+if filename.endswith('.vcf') or filename.endswith('.vcf.gz') or filename.endswith('.bcf'):
+# Assign the associated input command
+if filename.endswith('.vcf'):
+return ['--vcf', filename]
+elif filename.endswith('.vcf.gz'):
+return ['--gzvcf', filename]
+elif filename.endswith('.bcf'):
+return ['--bcf', filename]
+# True if file extension is unknown or not recognized
+else:
+# Checks if the file is unzipped, bgzipped, or gzipped
+vcfname_format = checkFormat(filename)
+# Assign the associated input command, or return an error.
+if vcfname_format == 'vcf':
+return ['--vcf', filename]
+elif vcfname_format == 'bgzip':
+return ['--gzvcf', filename]
+elif vcfname_format == 'bcf':
+return ['--bcf', filename]
+else:
+raise Exception('Unknown VCF file format')

Mercurial > repos > jaredgk > ppp_vcfphase

comparison vcftools.py @ 3:d1e3db7f6521 draft