view vcftools.py @ 3:d1e3db7f6521 draft

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:28:38 -0400
parents
children
line wrap: on
line source

import os
import sys
import logging
import subprocess

sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))

from vcf_reader_func import checkFormat
from bcftools import check_bcftools_for_errors

def check_bgzip_for_errors (bgzip_stderr):
    '''
        Checks the bgzip stderr for errors

        Parameters
        ----------
        bgzip_stderr : str
            bgzip stderr

        Raises
        ------
        IOError
            If bgzip stderr returns an error
    '''

    if bgzip_stderr:
        raise IOError('Error occured while compressing the vcf file')

def bgzip_decompress_vcfgz (vcfgz_filename, out_prefix = '', keep_original = False):
        '''
            Converts a vcf.gz to vcf

            The function automates bgzip to decompress a vcf.gz file into a vcf

            Parameters
            ----------
            vcfgz_filename : str
                The file name of the vcf.gz file to be decompressed
            out_prefix : str
                Output file prefix (i.e. filename without extension)
            keep_original : bool
                Specifies if the original file should be kept

            Raises
            ------
            IOError
                Error in creating the compressed file
        '''

        # Run bgzip with stdout piped to file
        if keep_original or out_prefix:

            if out_prefix:

                # Assign the bgzip filename
                vcf_filename = out_prefix + '.vcf'

            else:

                # Seperate into path and filename
                split_path, split_filename = os.path.split(vcfgz_filename)

                # Remove any file extensions
                vcf_basename = split_filename.split(os.extsep)[0] + '.vcf'

                # Join path and filename
                vcf_filename = os.path.join(split_path, vcf_basename)

            # Create the output file
            vcf_file = open(vcf_filename, 'w')

            # bgzip subprocess call
            bgzip_call = subprocess.Popen(['bgzip', '-dc', vcfgz_filename], stdout = vcf_file, stderr = subprocess.PIPE)

        # Run bgzip normally
        else:

            # bgzip subprocess call
            bgzip_call = subprocess.Popen(['bgzip', '-d', vcfgz_filename], stdout = subprocess.PIPE, stderr = subprocess.PIPE)

        # Save the stdout and stderr from bgzip
        bgzip_out, bgzip_err = bgzip_call.communicate()

        # Check that output file was compressed correctly
        check_bgzip_for_errors(bgzip_err)

        # Delete input when also using an output prefix
        if out_prefix and not keep_original:
            os.remove(vcfgz_filename)

def bgzip_compress_vcf (vcf_filename, out_prefix = '', keep_original = False):
        '''
            Converts a vcf to vcf.gz

            The function automates bgzip to compress a vcf file into a vcf.gz

            Parameters
            ----------
            vcf_filename : str
                The file name of the vcf file to be compressed
            keep_original : bool
                Specifies if the original file should be kept

            Raises
            ------
            IOError
                Error in creating the compressed file
        '''

        # Compress and keep the original file
        if keep_original or out_prefix:

            if out_prefix:

                # Assign the filename
                vcfgz_filename = out_prefix + '.vcf.gz'

            else:

                # Seperate into path and filename
                split_path, split_filename = os.path.split(vcfgz_filename)

                # Remove any file extensions
                vcfgz_basename = split_filename.split(os.extsep)[0] + '.vcf.gz'

                # Join path and filename
                vcfgz_filename = os.path.join(split_path, vcfgz_basename)


            # Create the output file
            vcfgz_file = open(vcfgz_filename, 'w')

            # bgzip subprocess call
            bgzip_call = subprocess.Popen(['bgzip', '-c', vcf_filename], stdout = vcfgz_file, stderr = subprocess.PIPE)

        # Compress and do not keep the original file
        else:

            # bgzip subprocess call
            bgzip_call = subprocess.Popen(['bgzip', vcf_filename], stdout = subprocess.PIPE, stderr = subprocess.PIPE)

        # Save the stdout and stderr from bgzip
        bgzip_out, bgzip_err = bgzip_call.communicate()

        # Check that output file was compressed correctly
        check_bgzip_for_errors(bgzip_err)

def cvt_vcftools_site_to_bed (vcftools_out_str):
    # Check if str in the header
    if 'CHROM' not in vcftools_out_str or 'POS' not in vcftools_out_str:
        # Split the line into a list
        vcftools_out_data = vcftools_out_str.strip().split('\t')
        # Convert the chromStart to int
        vcftools_out_data[1] = int(vcftools_out_data[1])
        # Calc chromEnd
        chrom_end = vcftools_out_data[1] + 1
        # Add chrom_end to the list
        vcftools_out_data = vcftools_out_data + [chrom_end]
        # Return the list as a string (with newline element)
        return '\t'.join(map(str, vcftools_out_data)) + '\n'
    else:
        # Remove the header
        return ''

def pipe_vcftools (vcftools_call_args):
    '''
        Calls vcftools with pipe output

        The output of this function is the stdout and stderr of vcftools. This
        function should only be used if vcftools is being used as the stdin of
        another function. Please note that this function does not check the for
        errors in the vcftools call. Please check for errors after the call is
        closed using check_vcftools_for_errors.

        Parameters
        ----------
        vcftools_call_args : list
            vcftools arguments

        Returns
        -------
        vcftools_call : subprocess.Popen
            vcftools subprocess call
        vcftools_call.stdout : PIPE
            vcftools stdout PIPE (Results)
        vcftools_call.stderr : PIPE
            vcftools stderr PIPE (Log)

    '''

    # vcftools subprocess call
    vcftools_call = subprocess.Popen(['vcftools', '--stdout'] + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    return vcftools_call

def pipe_vcftools_to_bed_file (vcftools_call_args, output_filename):

    '''
        Pipes site-file output of vcftools to a bed formmated file

        The purpose of this function is to avoid creating large uncompressed
        vcf files by directly piping the output of vcftools to bgzip. This
        results in creating a vcf.gz file without any intermediates.

        Parameters
        ----------
        vcftools_call_args : list
            vcftools arguments
        output_filename : str
            Filename of the bed file

    '''
    # Open vcftools pipe
    vcftools_call = pipe_vcftools(vcftools_call_args)

    # Create the bed file
    bed_output = open(output_filename, 'w')

    try:
        # Iterate the vcftools stdout unless error occurs
        for vcftools_stdout_line in iter(vcftools_call.stdout.readline, b''):
            bed_output.write(cvt_vcftools_site_to_bed(vcftools_stdout_line))
        # Close the bed file
        bed_output.close()
    except:
        # Close the bed file
        bed_output.close()
        # Delete the file
        os.remove(output_filename)

    # Wait for vctools to finish
    vcftools_call.wait()

    # Close the vcftools stdout
    vcftools_call.stdout.close()

    # Read the vcftools stderr
    vcftools_stderr = vcftools_call.stderr.read()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        vcftools_stderr = vcftools_stderr.decode()

    # Check that the log file was created correctly
    check_vcftools_for_errors(vcftools_stderr)

    logging.info('vcftools call complete')

    return vcftools_stderr

def pipe_vcftools_bgzip (vcftools_call_args, output_filename):
    '''
        Pipes the output of vcftools to bgzip

        The purpose of this function is to avoid creating large uncompressed
        vcf files by directly piping the output of vcftools to bgzip. This
        results in creating a vcf.gz file without any intermediates.

        Parameters
        ----------
        vcftools_call_args : list
            vcftools arguments
        output_filename : str
            Filename of the compressed vcf file

    '''

    vcftools_call = pipe_vcftools(vcftools_call_args)

    # Create bgzip output file
    bgzip_output = open(output_filename, 'wb')

    # bgzip subprocess call
    bgzip_call = subprocess.Popen(['bgzip'], stdin = vcftools_call.stdout, stdout = bgzip_output, stderr = subprocess.PIPE)

    # Wait for vctools to finish
    vcftools_call.wait()

    # Close the vcftools stdout
    vcftools_call.stdout.close()

    # Read the vcftools stderr
    vcftools_stderr = vcftools_call.stderr.read()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        vcftools_stderr = vcftools_stderr.decode()

    # Check that the log file was created correctly
    check_vcftools_for_errors(vcftools_stderr)

    # Wait for bgzip to finish
    bgzip_call.wait()

    # Close the compressed vcf file
    bgzip_output.close()

    # Save the stderr from bgzip, stdout = None
    bgzip_stdout, bgzip_stderr = bgzip_call.communicate()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        bgzip_stderr = bgzip_stderr.decode()

    # Check that output file was compressed correctly
    check_bgzip_for_errors(bgzip_stderr)

    logging.info('vcftools and bgzip calls complete')

    return vcftools_stderr

def pipe_vcftools_bcftools (vcftools_call_args, output_filename):
    '''
        Pipes the output of vcftools to bcftools

        The purpose of this function is to avoid the vcftools command
        --recode-bcf that may result in malformed BCF files. To avoid large
        uncompressed intermediates, this function pipes the stdout of vcftools
        to bcftools.

        Parameters
        ----------
        vcftools_call_args : list
            vcftools arguments
        output_filename : str
            Filename of the BCF file

    '''

    vcftools_call = pipe_vcftools(vcftools_call_args)

    # Holds the arguments to convert to BCF format
    convert_args = ['view', '-O', 'b']

    # Assigns the output file to the arguments
    convert_args.extend(['-o', output_filename])

    # bcftools subprocess call
    bcftools_call = subprocess.Popen(['bcftools'] + convert_args, stdin = vcftools_call.stdout, stdout = subprocess.PIPE, stderr = subprocess.PIPE)

    # Wait for vctools to finish
    vcftools_call.wait()

    # Close the vcftools stdout
    vcftools_call.stdout.close()

    # Read the vcftools stderr
    vcftools_stderr = vcftools_call.stderr.read()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        vcftools_stderr = vcftools_stderr.decode()

    # Check that the log file was created correctly
    check_vcftools_for_errors(vcftools_stderr)

    # Wait for bgzip to finish
    bcftools_call.wait()

    # Save the stderr from bgzip, stdout = None
    bcftools_stdout, bcftools_stderr = bcftools_call.communicate()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        bcftools_stderr = bcftools_stderr.decode()

    # Check that output file was compressed correctly
    check_bcftools_for_errors(bcftools_stderr)

    logging.info('vcftools and bcftools calls complete')

    return vcftools_stderr

def pipe_vcftools_to_file (vcftools_call_args, output_filename, append_output = False):
    '''
        Pipes file output of vcftools to a standard file

        The function calls vcftools. Returns the stderr of vcftools to
        create log file of the call. The function may be used to append multiple
        calls to vcftools to a single file

        Parameters
        ----------
        vcftools_call_args : list
            vcftools arguments
        append_output : bool
            The output file should be written in append mode

        Returns
        -------
        vcftools_err : str
            vcftools log output

        Raises
        ------
        Exception
            If vcftools stderr returns an error
    '''

    # Open vcftools pipe
    vcftools_call = pipe_vcftools(vcftools_call_args)

    # Check if the output should be opened in append mode
    if append_output:
        # Create the output file (in append mode)
        output_file = open(output_filename, 'a')
    else:
        # Create the output file (in write mode)
        output_file = open(output_filename, 'w')


    try:
        # Create iterator of the vcftools stdout
        stdout_iter = iter(vcftools_call.stdout.readline, b'')

        # Check if the output is being appended and the file is empty
        if append_output and os.stat(output_filename).st_size != 0:
            # Skip the header if the file isn't empty and appending
            next(stdout_iter)

        # Iterate the vcftools stdout
        for vcftools_stdout_line in stdout_iter:

            # Check if code is running in python 3
            if sys.version_info[0] == 3:
                # Convert bytes to string
                vcftools_stdout_line = vcftools_stdout_line.decode()

            output_file.write(vcftools_stdout_line)

        # Close the bed file
        output_file.close()

    except:
        # Close the bed file
        output_file.close()
        # Delete the file
        os.remove(output_filename)

        raise Exception('vcftools to python pipe error')

    # Wait for vctools to finish
    vcftools_call.wait()

    # Close the vcftools stdout
    vcftools_call.stdout.close()

    # Read the vcftools stderr
    vcftools_stderr = vcftools_call.stderr.read()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        vcftools_stderr = vcftools_stderr.decode()

    # Check that the log file was created correctly
    check_vcftools_for_errors(vcftools_stderr)

    logging.info('vcftools call complete')

    return vcftools_stderr

def standard_vcftools_call (vcftools_call_args):
    '''
        Calls vcftools

        The function calls vcftools. Returns the stderr of vcftools to
        create log file of the call.

        Parameters
        ----------
        vcftools_call_args : list
            vcftools arguments

        Returns
        -------
        vcftools_out : str
            vcftools call output
        vcftools_err : str
            vcftools log output

        Raises
        ------
        Exception
            If vcftools stderr returns an error
    '''

    # vcftools subprocess call without stdout
    vcftools_call = subprocess.Popen(['vcftools'] + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Wait for vcftools to finish
    vcftools_stdout, vcftools_stderr = vcftools_call.communicate()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        vcftools_stderr = vcftools_stderr.decode()

    logging.info('vcftools call complete')

    # Check that the log file was created correctly
    check_vcftools_for_errors(vcftools_stderr)

    return vcftools_stderr

def call_vcftools (vcftools_call_args, output_format = None, output_filename = None):
    '''
        Calls vcftools

        The function calls vcftools. Returns the stderr of vcftools to
        create log file of the call.

        Parameters
        ----------
        vcftools_call_args : list
            vcftools arguments
        output_format : str
            The output format
        output_filename : str
            The output filename assigned by vcftools (for piped calls)

        Returns
        -------
        vcftools_out : str
            vcftools call output
        vcftools_err : str
            vcftools log output

        Raises
        ------
        Exception
            If vcftools stderr returns an error
    '''

    # Check if the output is a bgzipped vcf
    if output_format == 'vcf.gz':
        # Pipe vcftools stdout to bgzip to create a bgzipped vcf
        vcftools_err = pipe_vcftools_bgzip(vcftools_call_args, output_filename)
    # Check if the output is a bcf
    elif output_format == 'bcf':
        # Pipe vcftools stdout to bgzip to create a bgzipped vcf
        vcftools_err = pipe_vcftools_bcftools(vcftools_call_args, output_filename)
    elif output_format == 'removed_bed' or output_format == 'kept_bed':
        # Pipe vcftools stdout to bed file
        vcftools_err = pipe_vcftools_to_bed_file(vcftools_call_args, output_filename)
    elif output_format == 'het-fis':
        vcftools_err = pipe_vcftools_to_file(vcftools_call_args, output_filename, append_output = True)
    else:
        # Call vcftools under standard conditions
        vcftools_err = standard_vcftools_call(vcftools_call_args)

    # Return the log
    return vcftools_err

def check_for_vcftools_output (vcftools_output):
    '''
        Checks for the previous vcftools output

        Confirms that neither a previous vcftools log or output file exists.

        Parameters
        ----------
        vcftools_output : str
            Specifies the output filename to be checked

        Raises
        ------
        IOError
            If the vcftools output file exists
        IOError
            If the vcftools log file exists

    '''
    # Check if output file already exists
    if os.path.isfile(vcftools_output):
        raise IOError('VCF output file already exists')

    logging.info('Output file assigned')

    # Check if log file already exists
    if os.path.isfile(vcftools_output + '.log'):
        raise IOError('Log file already exists')

    logging.info('Log file assigned')

def delete_vcftools_output (vcftools_output):
    '''
        Deletes previous vcftools output

        Confirms if previous vcftools output exists, and if so, deletes it

        Parameters
        ----------
        vcftools_output : str
            Specifies the output filename to be deleted

        Raises
        ------
        IOError
            If the vcftools output cannot be deleted
        IOError
            If the vcftools log cannot be deleted
    '''

    # Check if output file already exists
    if os.path.isfile(vcftools_output):
        try:
            # Delete the output
            os.remove(vcftools_output)
        except:
            raise IOError('VCF output file cannot be deleted')

    logging.info('Output file assigned')

    # Check if log file already exists
    if os.path.isfile(vcftools_output + '.log'):
        try:
            # Delete the output
            os.remove(vcftools_output + '.log')
        except:
            raise IOError('Log file cannot be deleted')

    logging.info('Log file assigned')

def check_vcftools_for_errors (vcftools_stderr):
    '''
        Checks the vcftools stderr for errors

        Parameters
        ----------
        vcftools_stderr : str
            vcftools stderr

        Raises
        ------
        IOError
            If vcftools stderr returns an error
    '''

    # Returns True if the job completed without error
    if 'Run Time' in str(vcftools_stderr):
        pass

    # Print output for vcftools if error is detected
    elif 'Error' in str(vcftools_stderr):
        # Splits log into list of lines
        vcftools_stderr_lines = vcftools_stderr.splitlines()
        # Prints the error(s)
        raise Exception('\n'.join((output_line for output_line in vcftools_stderr_lines if output_line.startswith('Error'))))

    # Print output if not completed and no error found. Unlikely to be used, but included.
    else:
        raise Exception(vcftools_stderr)

def produce_vcftools_output (output, filename, append_mode = False, strip_header = False):
    '''
        Creates the vcftools output file

        This function will create an output file from the vcftools stdout.
        Please run `check_vcftools_for_errors` prior to check that vcftools
        finished without error.

        Parameters
        ----------
        output : str
            vcftools stdout
        filename : str
            Specifies the filename for the output file
        append_mode : bool
            Used to create a single output file from multiple calls
        strip_header : bool
            Used to remove the header if not needed

        Returns
        -------
        output : file
            vcftools output file

    '''

    # Check if the header should be stripped
    if strip_header:
        output = ''.join(output.splitlines(True)[1:])

    # Check if single log file is required from multiple calls
    if append_mode:
        vcftools_log_file = open(filename,'a')
    else:
        vcftools_log_file = open(filename,'w')

    vcftools_log_file.write(str(output))
    vcftools_log_file.close()

def produce_vcftools_log (output, filename, append_mode = False):
    '''
        Creates the vcftools log file

        This function will create a log file from the vcftools stderr. Please
        run `check_vcftools_for_errors` prior to check that vcftools finished
        without error.

        Parameters
        ----------
        output : str
            vcftools stderr
        filename : str
            Specifies the filename for the log file
        append_mode : bool
            Used to create a single log file from multiple calls

        Returns
        -------
        output : file
            vcftools log file

    '''
    # Check if single log file is required from multiple calls
    if append_mode:
        vcftools_log_file = open(filename + '.log','a')
    else:
        vcftools_log_file = open(filename + '.log','w')

    vcftools_log_file.write(str(output))
    vcftools_log_file.close()

def assign_vcftools_input_arg (filename):
    '''
        Confirms file format for vcftools

        Parameters
        ----------
        filename : str
            Specifies the input filename of unknown format

        Returns
        -------
        list
            Returns vcftools input command for `filename`

        Raises
        ------
        IOError
            If filename is an unknown file format
    '''

    # True if file extensions is recognized by vcftools
    if filename.endswith('.vcf') or filename.endswith('.vcf.gz') or filename.endswith('.bcf'):
        # Assign the associated input command
        if filename.endswith('.vcf'):
            return ['--vcf', filename]
        elif filename.endswith('.vcf.gz'):
            return ['--gzvcf', filename]
        elif filename.endswith('.bcf'):
            return ['--bcf', filename]

    # True if file extension is unknown or not recognized
    else:

        # Checks if the file is unzipped, bgzipped, or gzipped
        vcfname_format = checkFormat(filename)

        # Assign the associated input command, or return an error.
        if vcfname_format == 'vcf':
            return ['--vcf', filename]
        elif vcfname_format == 'bgzip':
            return ['--gzvcf', filename]
        elif vcfname_format == 'bcf':
            return ['--bcf', filename]
        else:
            raise Exception('Unknown VCF file format')