Mercurial > repos > jaredgk > ppp_vcfphase
changeset 3:d1e3db7f6521 draft
Uploaded
author | jaredgk |
---|---|
date | Wed, 17 Oct 2018 17:28:38 -0400 |
parents | 54c84f7dcb2c |
children | 901857c9b24f |
files | vcftools.py |
diffstat | 1 files changed, 775 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcftools.py Wed Oct 17 17:28:38 2018 -0400 @@ -0,0 +1,775 @@ +import os +import sys +import logging +import subprocess + +sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared'))) + +from vcf_reader_func import checkFormat +from bcftools import check_bcftools_for_errors + +def check_bgzip_for_errors (bgzip_stderr): + ''' + Checks the bgzip stderr for errors + + Parameters + ---------- + bgzip_stderr : str + bgzip stderr + + Raises + ------ + IOError + If bgzip stderr returns an error + ''' + + if bgzip_stderr: + raise IOError('Error occured while compressing the vcf file') + +def bgzip_decompress_vcfgz (vcfgz_filename, out_prefix = '', keep_original = False): + ''' + Converts a vcf.gz to vcf + + The function automates bgzip to decompress a vcf.gz file into a vcf + + Parameters + ---------- + vcfgz_filename : str + The file name of the vcf.gz file to be decompressed + out_prefix : str + Output file prefix (i.e. filename without extension) + keep_original : bool + Specifies if the original file should be kept + + Raises + ------ + IOError + Error in creating the compressed file + ''' + + # Run bgzip with stdout piped to file + if keep_original or out_prefix: + + if out_prefix: + + # Assign the bgzip filename + vcf_filename = out_prefix + '.vcf' + + else: + + # Seperate into path and filename + split_path, split_filename = os.path.split(vcfgz_filename) + + # Remove any file extensions + vcf_basename = split_filename.split(os.extsep)[0] + '.vcf' + + # Join path and filename + vcf_filename = os.path.join(split_path, vcf_basename) + + # Create the output file + vcf_file = open(vcf_filename, 'w') + + # bgzip subprocess call + bgzip_call = subprocess.Popen(['bgzip', '-dc', vcfgz_filename], stdout = vcf_file, stderr = subprocess.PIPE) + + # Run bgzip normally + else: + + # bgzip subprocess call + bgzip_call = subprocess.Popen(['bgzip', '-d', vcfgz_filename], stdout = subprocess.PIPE, stderr = subprocess.PIPE) + + # Save the stdout and stderr from bgzip + bgzip_out, bgzip_err = bgzip_call.communicate() + + # Check that output file was compressed correctly + check_bgzip_for_errors(bgzip_err) + + # Delete input when also using an output prefix + if out_prefix and not keep_original: + os.remove(vcfgz_filename) + +def bgzip_compress_vcf (vcf_filename, out_prefix = '', keep_original = False): + ''' + Converts a vcf to vcf.gz + + The function automates bgzip to compress a vcf file into a vcf.gz + + Parameters + ---------- + vcf_filename : str + The file name of the vcf file to be compressed + keep_original : bool + Specifies if the original file should be kept + + Raises + ------ + IOError + Error in creating the compressed file + ''' + + # Compress and keep the original file + if keep_original or out_prefix: + + if out_prefix: + + # Assign the filename + vcfgz_filename = out_prefix + '.vcf.gz' + + else: + + # Seperate into path and filename + split_path, split_filename = os.path.split(vcfgz_filename) + + # Remove any file extensions + vcfgz_basename = split_filename.split(os.extsep)[0] + '.vcf.gz' + + # Join path and filename + vcfgz_filename = os.path.join(split_path, vcfgz_basename) + + + # Create the output file + vcfgz_file = open(vcfgz_filename, 'w') + + # bgzip subprocess call + bgzip_call = subprocess.Popen(['bgzip', '-c', vcf_filename], stdout = vcfgz_file, stderr = subprocess.PIPE) + + # Compress and do not keep the original file + else: + + # bgzip subprocess call + bgzip_call = subprocess.Popen(['bgzip', vcf_filename], stdout = subprocess.PIPE, stderr = subprocess.PIPE) + + # Save the stdout and stderr from bgzip + bgzip_out, bgzip_err = bgzip_call.communicate() + + # Check that output file was compressed correctly + check_bgzip_for_errors(bgzip_err) + +def cvt_vcftools_site_to_bed (vcftools_out_str): + # Check if str in the header + if 'CHROM' not in vcftools_out_str or 'POS' not in vcftools_out_str: + # Split the line into a list + vcftools_out_data = vcftools_out_str.strip().split('\t') + # Convert the chromStart to int + vcftools_out_data[1] = int(vcftools_out_data[1]) + # Calc chromEnd + chrom_end = vcftools_out_data[1] + 1 + # Add chrom_end to the list + vcftools_out_data = vcftools_out_data + [chrom_end] + # Return the list as a string (with newline element) + return '\t'.join(map(str, vcftools_out_data)) + '\n' + else: + # Remove the header + return '' + +def pipe_vcftools (vcftools_call_args): + ''' + Calls vcftools with pipe output + + The output of this function is the stdout and stderr of vcftools. This + function should only be used if vcftools is being used as the stdin of + another function. Please note that this function does not check the for + errors in the vcftools call. Please check for errors after the call is + closed using check_vcftools_for_errors. + + Parameters + ---------- + vcftools_call_args : list + vcftools arguments + + Returns + ------- + vcftools_call : subprocess.Popen + vcftools subprocess call + vcftools_call.stdout : PIPE + vcftools stdout PIPE (Results) + vcftools_call.stderr : PIPE + vcftools stderr PIPE (Log) + + ''' + + # vcftools subprocess call + vcftools_call = subprocess.Popen(['vcftools', '--stdout'] + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + return vcftools_call + +def pipe_vcftools_to_bed_file (vcftools_call_args, output_filename): + + ''' + Pipes site-file output of vcftools to a bed formmated file + + The purpose of this function is to avoid creating large uncompressed + vcf files by directly piping the output of vcftools to bgzip. This + results in creating a vcf.gz file without any intermediates. + + Parameters + ---------- + vcftools_call_args : list + vcftools arguments + output_filename : str + Filename of the bed file + + ''' + # Open vcftools pipe + vcftools_call = pipe_vcftools(vcftools_call_args) + + # Create the bed file + bed_output = open(output_filename, 'w') + + try: + # Iterate the vcftools stdout unless error occurs + for vcftools_stdout_line in iter(vcftools_call.stdout.readline, b''): + bed_output.write(cvt_vcftools_site_to_bed(vcftools_stdout_line)) + # Close the bed file + bed_output.close() + except: + # Close the bed file + bed_output.close() + # Delete the file + os.remove(output_filename) + + # Wait for vctools to finish + vcftools_call.wait() + + # Close the vcftools stdout + vcftools_call.stdout.close() + + # Read the vcftools stderr + vcftools_stderr = vcftools_call.stderr.read() + + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + vcftools_stderr = vcftools_stderr.decode() + + # Check that the log file was created correctly + check_vcftools_for_errors(vcftools_stderr) + + logging.info('vcftools call complete') + + return vcftools_stderr + +def pipe_vcftools_bgzip (vcftools_call_args, output_filename): + ''' + Pipes the output of vcftools to bgzip + + The purpose of this function is to avoid creating large uncompressed + vcf files by directly piping the output of vcftools to bgzip. This + results in creating a vcf.gz file without any intermediates. + + Parameters + ---------- + vcftools_call_args : list + vcftools arguments + output_filename : str + Filename of the compressed vcf file + + ''' + + vcftools_call = pipe_vcftools(vcftools_call_args) + + # Create bgzip output file + bgzip_output = open(output_filename, 'wb') + + # bgzip subprocess call + bgzip_call = subprocess.Popen(['bgzip'], stdin = vcftools_call.stdout, stdout = bgzip_output, stderr = subprocess.PIPE) + + # Wait for vctools to finish + vcftools_call.wait() + + # Close the vcftools stdout + vcftools_call.stdout.close() + + # Read the vcftools stderr + vcftools_stderr = vcftools_call.stderr.read() + + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + vcftools_stderr = vcftools_stderr.decode() + + # Check that the log file was created correctly + check_vcftools_for_errors(vcftools_stderr) + + # Wait for bgzip to finish + bgzip_call.wait() + + # Close the compressed vcf file + bgzip_output.close() + + # Save the stderr from bgzip, stdout = None + bgzip_stdout, bgzip_stderr = bgzip_call.communicate() + + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + bgzip_stderr = bgzip_stderr.decode() + + # Check that output file was compressed correctly + check_bgzip_for_errors(bgzip_stderr) + + logging.info('vcftools and bgzip calls complete') + + return vcftools_stderr + +def pipe_vcftools_bcftools (vcftools_call_args, output_filename): + ''' + Pipes the output of vcftools to bcftools + + The purpose of this function is to avoid the vcftools command + --recode-bcf that may result in malformed BCF files. To avoid large + uncompressed intermediates, this function pipes the stdout of vcftools + to bcftools. + + Parameters + ---------- + vcftools_call_args : list + vcftools arguments + output_filename : str + Filename of the BCF file + + ''' + + vcftools_call = pipe_vcftools(vcftools_call_args) + + # Holds the arguments to convert to BCF format + convert_args = ['view', '-O', 'b'] + + # Assigns the output file to the arguments + convert_args.extend(['-o', output_filename]) + + # bcftools subprocess call + bcftools_call = subprocess.Popen(['bcftools'] + convert_args, stdin = vcftools_call.stdout, stdout = subprocess.PIPE, stderr = subprocess.PIPE) + + # Wait for vctools to finish + vcftools_call.wait() + + # Close the vcftools stdout + vcftools_call.stdout.close() + + # Read the vcftools stderr + vcftools_stderr = vcftools_call.stderr.read() + + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + vcftools_stderr = vcftools_stderr.decode() + + # Check that the log file was created correctly + check_vcftools_for_errors(vcftools_stderr) + + # Wait for bgzip to finish + bcftools_call.wait() + + # Save the stderr from bgzip, stdout = None + bcftools_stdout, bcftools_stderr = bcftools_call.communicate() + + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + bcftools_stderr = bcftools_stderr.decode() + + # Check that output file was compressed correctly + check_bcftools_for_errors(bcftools_stderr) + + logging.info('vcftools and bcftools calls complete') + + return vcftools_stderr + +def pipe_vcftools_to_file (vcftools_call_args, output_filename, append_output = False): + ''' + Pipes file output of vcftools to a standard file + + The function calls vcftools. Returns the stderr of vcftools to + create log file of the call. The function may be used to append multiple + calls to vcftools to a single file + + Parameters + ---------- + vcftools_call_args : list + vcftools arguments + append_output : bool + The output file should be written in append mode + + Returns + ------- + vcftools_err : str + vcftools log output + + Raises + ------ + Exception + If vcftools stderr returns an error + ''' + + # Open vcftools pipe + vcftools_call = pipe_vcftools(vcftools_call_args) + + # Check if the output should be opened in append mode + if append_output: + # Create the output file (in append mode) + output_file = open(output_filename, 'a') + else: + # Create the output file (in write mode) + output_file = open(output_filename, 'w') + + + try: + # Create iterator of the vcftools stdout + stdout_iter = iter(vcftools_call.stdout.readline, b'') + + # Check if the output is being appended and the file is empty + if append_output and os.stat(output_filename).st_size != 0: + # Skip the header if the file isn't empty and appending + next(stdout_iter) + + # Iterate the vcftools stdout + for vcftools_stdout_line in stdout_iter: + + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + vcftools_stdout_line = vcftools_stdout_line.decode() + + output_file.write(vcftools_stdout_line) + + # Close the bed file + output_file.close() + + except: + # Close the bed file + output_file.close() + # Delete the file + os.remove(output_filename) + + raise Exception('vcftools to python pipe error') + + # Wait for vctools to finish + vcftools_call.wait() + + # Close the vcftools stdout + vcftools_call.stdout.close() + + # Read the vcftools stderr + vcftools_stderr = vcftools_call.stderr.read() + + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + vcftools_stderr = vcftools_stderr.decode() + + # Check that the log file was created correctly + check_vcftools_for_errors(vcftools_stderr) + + logging.info('vcftools call complete') + + return vcftools_stderr + +def standard_vcftools_call (vcftools_call_args): + ''' + Calls vcftools + + The function calls vcftools. Returns the stderr of vcftools to + create log file of the call. + + Parameters + ---------- + vcftools_call_args : list + vcftools arguments + + Returns + ------- + vcftools_out : str + vcftools call output + vcftools_err : str + vcftools log output + + Raises + ------ + Exception + If vcftools stderr returns an error + ''' + + # vcftools subprocess call without stdout + vcftools_call = subprocess.Popen(['vcftools'] + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # Wait for vcftools to finish + vcftools_stdout, vcftools_stderr = vcftools_call.communicate() + + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + vcftools_stderr = vcftools_stderr.decode() + + logging.info('vcftools call complete') + + # Check that the log file was created correctly + check_vcftools_for_errors(vcftools_stderr) + + return vcftools_stderr + +def call_vcftools (vcftools_call_args, output_format = None, output_filename = None): + ''' + Calls vcftools + + The function calls vcftools. Returns the stderr of vcftools to + create log file of the call. + + Parameters + ---------- + vcftools_call_args : list + vcftools arguments + output_format : str + The output format + output_filename : str + The output filename assigned by vcftools (for piped calls) + + Returns + ------- + vcftools_out : str + vcftools call output + vcftools_err : str + vcftools log output + + Raises + ------ + Exception + If vcftools stderr returns an error + ''' + + # Check if the output is a bgzipped vcf + if output_format == 'vcf.gz': + # Pipe vcftools stdout to bgzip to create a bgzipped vcf + vcftools_err = pipe_vcftools_bgzip(vcftools_call_args, output_filename) + # Check if the output is a bcf + elif output_format == 'bcf': + # Pipe vcftools stdout to bgzip to create a bgzipped vcf + vcftools_err = pipe_vcftools_bcftools(vcftools_call_args, output_filename) + elif output_format == 'removed_bed' or output_format == 'kept_bed': + # Pipe vcftools stdout to bed file + vcftools_err = pipe_vcftools_to_bed_file(vcftools_call_args, output_filename) + elif output_format == 'het-fis': + vcftools_err = pipe_vcftools_to_file(vcftools_call_args, output_filename, append_output = True) + else: + # Call vcftools under standard conditions + vcftools_err = standard_vcftools_call(vcftools_call_args) + + # Return the log + return vcftools_err + +def check_for_vcftools_output (vcftools_output): + ''' + Checks for the previous vcftools output + + Confirms that neither a previous vcftools log or output file exists. + + Parameters + ---------- + vcftools_output : str + Specifies the output filename to be checked + + Raises + ------ + IOError + If the vcftools output file exists + IOError + If the vcftools log file exists + + ''' + # Check if output file already exists + if os.path.isfile(vcftools_output): + raise IOError('VCF output file already exists') + + logging.info('Output file assigned') + + # Check if log file already exists + if os.path.isfile(vcftools_output + '.log'): + raise IOError('Log file already exists') + + logging.info('Log file assigned') + +def delete_vcftools_output (vcftools_output): + ''' + Deletes previous vcftools output + + Confirms if previous vcftools output exists, and if so, deletes it + + Parameters + ---------- + vcftools_output : str + Specifies the output filename to be deleted + + Raises + ------ + IOError + If the vcftools output cannot be deleted + IOError + If the vcftools log cannot be deleted + ''' + + # Check if output file already exists + if os.path.isfile(vcftools_output): + try: + # Delete the output + os.remove(vcftools_output) + except: + raise IOError('VCF output file cannot be deleted') + + logging.info('Output file assigned') + + # Check if log file already exists + if os.path.isfile(vcftools_output + '.log'): + try: + # Delete the output + os.remove(vcftools_output + '.log') + except: + raise IOError('Log file cannot be deleted') + + logging.info('Log file assigned') + +def check_vcftools_for_errors (vcftools_stderr): + ''' + Checks the vcftools stderr for errors + + Parameters + ---------- + vcftools_stderr : str + vcftools stderr + + Raises + ------ + IOError + If vcftools stderr returns an error + ''' + + # Returns True if the job completed without error + if 'Run Time' in str(vcftools_stderr): + pass + + # Print output for vcftools if error is detected + elif 'Error' in str(vcftools_stderr): + # Splits log into list of lines + vcftools_stderr_lines = vcftools_stderr.splitlines() + # Prints the error(s) + raise Exception('\n'.join((output_line for output_line in vcftools_stderr_lines if output_line.startswith('Error')))) + + # Print output if not completed and no error found. Unlikely to be used, but included. + else: + raise Exception(vcftools_stderr) + +def produce_vcftools_output (output, filename, append_mode = False, strip_header = False): + ''' + Creates the vcftools output file + + This function will create an output file from the vcftools stdout. + Please run `check_vcftools_for_errors` prior to check that vcftools + finished without error. + + Parameters + ---------- + output : str + vcftools stdout + filename : str + Specifies the filename for the output file + append_mode : bool + Used to create a single output file from multiple calls + strip_header : bool + Used to remove the header if not needed + + Returns + ------- + output : file + vcftools output file + + ''' + + # Check if the header should be stripped + if strip_header: + output = ''.join(output.splitlines(True)[1:]) + + # Check if single log file is required from multiple calls + if append_mode: + vcftools_log_file = open(filename,'a') + else: + vcftools_log_file = open(filename,'w') + + vcftools_log_file.write(str(output)) + vcftools_log_file.close() + +def produce_vcftools_log (output, filename, append_mode = False): + ''' + Creates the vcftools log file + + This function will create a log file from the vcftools stderr. Please + run `check_vcftools_for_errors` prior to check that vcftools finished + without error. + + Parameters + ---------- + output : str + vcftools stderr + filename : str + Specifies the filename for the log file + append_mode : bool + Used to create a single log file from multiple calls + + Returns + ------- + output : file + vcftools log file + + ''' + # Check if single log file is required from multiple calls + if append_mode: + vcftools_log_file = open(filename + '.log','a') + else: + vcftools_log_file = open(filename + '.log','w') + + vcftools_log_file.write(str(output)) + vcftools_log_file.close() + +def assign_vcftools_input_arg (filename): + ''' + Confirms file format for vcftools + + Parameters + ---------- + filename : str + Specifies the input filename of unknown format + + Returns + ------- + list + Returns vcftools input command for `filename` + + Raises + ------ + IOError + If filename is an unknown file format + ''' + + # True if file extensions is recognized by vcftools + if filename.endswith('.vcf') or filename.endswith('.vcf.gz') or filename.endswith('.bcf'): + # Assign the associated input command + if filename.endswith('.vcf'): + return ['--vcf', filename] + elif filename.endswith('.vcf.gz'): + return ['--gzvcf', filename] + elif filename.endswith('.bcf'): + return ['--bcf', filename] + + # True if file extension is unknown or not recognized + else: + + # Checks if the file is unzipped, bgzipped, or gzipped + vcfname_format = checkFormat(filename) + + # Assign the associated input command, or return an error. + if vcfname_format == 'vcf': + return ['--vcf', filename] + elif vcfname_format == 'bgzip': + return ['--gzvcf', filename] + elif vcfname_format == 'bcf': + return ['--bcf', filename] + else: + raise Exception('Unknown VCF file format')