view vcf_phase.py @ 3:d1e3db7f6521 draft

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:28:38 -0400
parents 3830d29fca6a
children
line wrap: on
line source

import os
import sys
import copy
import shutil
import argparse
import glob
import logging

sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))

from vcf_reader_func import checkFormat
from logging_module import initLogger, logArgs
from model import read_model_file
from beagle import call_beagle
from shapeit import call_shapeit, remove_intermediate_files
from bcftools import pipe_bcftools_to_chr, chr_subset_file, concatenate

def phase_argument_parser(passed_arguments):
    '''Phase Argument Parser - Assigns arguments for vcftools from command line.
    Depending on the argument in question, a default value may be specified'''

    def parser_confirm_file ():
        '''Custom action to confirm file exists'''
        class customAction(argparse.Action):
            def __call__(self, parser, args, value, option_string=None):
                if not os.path.isfile(value):
                    raise IOError('%s not found' % value)
                setattr(args, self.dest, value)
        return customAction

    def metavar_list (var_list):
        '''Create a formmated metavar list for the help output'''
        return '{' + ', '.join(var_list) + '}'

    phase_parser = argparse.ArgumentParser()

    # Input arguments
    phase_parser.add_argument('--vcf', help = "Input VCF filename", type = str, required = True, action = parser_confirm_file())

    # Model file arguments
    phase_parser.add_argument('--model-file', help = 'Defines the model file', type = str, action = parser_confirm_file())
    phase_parser.add_argument('--model', help = 'Defines the model to analyze', type = str)

    # General arguments
    phase_parser.add_argument('--overwrite', help = "Overwrite previous output files", action = 'store_true')
    phase_parser.add_argument('--beagle-path', help = "Defines path to locate beagle.jar", type = str, default = 'bin/')

    # Phase algorithm argument
    phasing_list = ['beagle', 'shapeit']
    phasing_default = 'beagle'
    phase_parser.add_argument('--phase-algorithm', metavar = metavar_list(phasing_list), help = 'Specifies the phase algorithm to be used', type = str, choices = phasing_list, default = phasing_default)

    # Common phasing arguments
    phase_parser.add_argument('--Ne', help = 'Defines the effective population size', type = int)
    phase_parser.add_argument('--random-seed', help="Defines the random seed value for the random number generator", type = int)
    phase_parser.add_argument('--genetic-map', help = 'Genetic map filename', type = str, action = parser_confirm_file())
    phase_parser.add_argument('--phase-chr', help = 'Selects a single chromosome to phase', type = str)
    phase_parser.add_argument('--phase-from-bp', help = 'Lower bound of sites to include (Only usable with a single chromosome)', type = int)
    phase_parser.add_argument('--phase-to-bp', help = 'Upper bound of sites to include (Only usable with a single chromosome)', type = int)

    # Shapeit-specific options
    phase_parser.add_argument('--shapeit-burn-iter', help = 'Number of the burn-in iterations (shapeit)', type = int)
    phase_parser.add_argument('--shapeit-prune-iter', help = 'Number of pruning iterations (shapeit)', type = int)
    phase_parser.add_argument('--shapeit-main-iter', help = 'Number of main iterations (shapeit)', type = int)
    phase_parser.add_argument('--shapeit-states', help = 'Number of conditioning states for haplotype estimation (shapeit)', type = int)
    phase_parser.add_argument('--shapeit-window', help = 'Model window size in Mb (shapeit)', type = float)

    # Beagle-specific options
    phase_parser.add_argument('--beagle-burn-iter', help = 'Number of the burn-in iterations (beagle)', type = int)
    phase_parser.add_argument('--beagle-iter', help = 'Number of iterations after burn-in (beagle)', type = int)
    phase_parser.add_argument('--beagle-states', help = 'Number of model states for genotype estimation (beagle)', type = int)
    phase_parser.add_argument('--beagle-window', help = 'Sliding window size in cM (beagle)', type = float)
    phase_parser.add_argument('--beagle-overlap', help = 'Overlap between neighboring sliding windows in cM (beagle)', type = float)
    phase_parser.add_argument('--beagle-error', help = 'HMM allele mismatch probability (beagle)', type = float)
    phase_parser.add_argument('--beagle-step', help = 'Step length in cM used for identifying short IBS segments (beagle)', type = float)
    phase_parser.add_argument('--beagle-nsteps', help = 'Number of consecutive --beagle-steps used for identifying long IBS segments (beagle)', type = int)

    # Output arguments
    phase_parser.add_argument('--out', help = 'Defines the output filename')
    phase_parser.add_argument('--out-prefix', help = 'Defines the output prefix (used by phasing algorithms)', default = 'out')

    out_format_list = ['vcf', 'vcf.gz', 'bcf']
    out_format_default = 'vcf.gz'

    phase_parser.add_argument('--out-format', metavar = metavar_list(out_format_list), help = 'Specifies the output format.', type = str, choices = out_format_list, default = out_format_default)

    if passed_arguments:
        return phase_parser.parse_args(passed_arguments)
    else:
        return phase_parser.parse_args()

def concatenate_logs (log_files, new_log_filename):
    '''
        Concatenates log files

        This function is used to concatenate log files. If a single chromosome
        is being analyzed, the function is used to concatenate the log files
        from multiple wrappers. If multiple chromosomes are being analyzed the
        function is used to concatenate the logs of each chromosome.

        Raises
        ------
        IOError
            If a log file does not exist
    '''

    # Open the new log file
    with open(new_log_filename, 'wb') as out_file:
        # Loop the logs to be combined
        for log_file in log_files:
            # Confirm the log file exists, raise an error if not
            if not os.path.isfile(log_file):
                raise IOError('%s not found' % log_file)
            # Open the log file
            with open(log_file, 'rb') as in_file:
                # Copy the contents of the file
                shutil.copyfileobj(in_file, out_file)

    # Remove old log files
    for log_file in log_files:
        os.remove(log_file)

def assign_vcf_extension (filename):
    # Checks if the file is unzipped, bgzipped, or gzipped
    vcfname_format = checkFormat(filename)

    if vcfname_format == 'vcf':
        return '.vcf'

    elif vcfname_format == 'gzip' or vcfname_format == 'bgzip':
        return '.vcf.gz'

    elif vcfname_format == 'bcf':
        raise Exception('BCF not supported in current version')

    else:
        raise Exception('Unknown file format')

def run (passed_arguments = []):
    '''
        Phaser for VCF files.

        Automates the phasing process for a specified VCF file. The function
        allows users to select between multiple phasing algorithms: beagle
        (default) and shapit.

        Parameters
        ----------
        --vcf : str
            Specifies the input VCF filename
        --phase-algorithm : str
            Specifies the algorithm to be used. Choices: beagle (default) and
            shapit
        --out : str
            Specifies the output filename

        Returns
        -------
        output : file
            Phased VCF file

        Raises
        ------
        IOError
            Input VCF file does not exist
        IOError
            Output file already exists

    '''

    # Grab VCF arguments from command line
    phase_args = phase_argument_parser(passed_arguments)

    # Adds the arguments (i.e. parameters) to the log file
    logArgs(phase_args, func_name = 'vcf_phase')

    # Assign file extension for VCF input file
    vcfname_ext = assign_vcf_extension(phase_args.vcf)

    logging.info('Input file assigned')

    # Used to confirm if the VCF input was renamed
    vcfname_renamed = False

    # Confirm input has correct file extension
    if vcfname_ext not in phase_args.vcf:
        vcfname_renamed = True
        os.rename(phase_args.vcf, phase_args.vcf + vcfname_ext)
        phase_args.vcf += vcfname_ext

    # List to hold beagle/shapeit arguments
    phase_call_args = []

    # bool to check if an eclude file was created
    exclude_file_created = False

    # bool to check if an include file was created
    include_file_created = False

    # Performs actions related to --model-file and argument assignment
    if phase_args.model_file:

        # Check if a model has been specified
        if not phase_args.model:
            raise IOError('No selected model. Please use --model to select a model')

        # Read in the model file
        models_file = read_model_file(phase_args.model_file)

        # Check that the selected model was found in the file
        if phase_args.model not in models_file:
            raise IOError('Selected model "%s" not found in: %s' % (phase_args.model, phase_args.model_file))

        logging.info('Model assigned')

        if phase_args.phase_algorithm == 'beagle':

            # Get list of individuals to include
            inds_in_model = models_file[phase_args.model].inds

            # Create file with individuals to exclude (beagle has no include option)
            models_file.create_exclude_ind_file(inds_to_include = inds_in_model, overwrite = phase_args.overwrite)

            # Assign exclude file
            phase_call_args.append('excludesamples=' + models_file.exclude_file)

            # Confirm that the exclude file was created
            exclude_file_created = True

        elif phase_args.phase_algorithm == 'shapeit':

            # Assign the model
            selected_model = models_file[phase_args.model]

            # Create file with individuals to exclude (beagle has no include option)
            selected_model.create_ind_file(overwrite = phase_args.overwrite)

            # Assign exclude file
            phase_call_args.extend(['--include-ind', selected_model.ind_file])

            # Confirm that the include file was created
            include_file_created = True

    # Get the list of chromosomes within the VCF
    chrs_in_vcf = pipe_bcftools_to_chr(phase_args.vcf)

    # Check if the user specified a specific chromosome
    if phase_args.phase_chr:
        # Check if the chromosome is within the file
        if phase_args.phase_chr not in chrs_in_vcf:
            raise Exception('--phase-chr %s not found in %s' % (phase_args.phase_chr, phase_args.vcf))

    # Check that a chr was specified if a bp-based argument was specified
    if (phase_args.phase_from_bp or phase_args.phase_to_bp) and not phase_args.phase_chr:
        # Check if the file has a single chromosome
        if len(chrs_in_vcf) == 1:
            # Check if the beagle algorithm is being used
            if phase_args.phase_algorithm == 'beagle':
                # Assign the chromosome, as beagle requires it to be assigned
                phase_args.phase_chr = chrs_in_vcf[0]

                logging.info('Assigned chr %s for beagle' % chrs_in_vcf[0])
        # Report error if multiple chromosomes are within the file
        else:
            raise Exception('The --phase-from-bp and --phase-to-bp arguments '
                            'require the --phase-chr argument to function if '
                            'multiple chrs are within %s' % phase_args.vcf)

    # Assign general arguments and call beagle
    if phase_args.phase_algorithm == 'beagle':

        # Assigns the input and output arguments
        phase_call_args.extend(['gt=' + phase_args.vcf,
                                'out=' + phase_args.out_prefix])

        # Assign the burn-in iter, if specified
        if phase_args.beagle_burn_iter:
            phase_call_args.append('burnin=' + phase_args.beagle_burn_iter)

        # Assign the iter, if specified
        if phase_args.beagle_iter:
            phase_call_args.append('iterations=' + phase_args.beagle_iter)

        # Assign the state count, if specified
        if phase_args.beagle_states:
            phase_call_args.append('phase-states=' + phase_args.beagle_states)

        # Assign the window length, if specified
        if phase_args.beagle_window:
            phase_call_args.append('window=' + phase_args.beagle_window)

        # Assign the window overlap, if specified
        if phase_args.beagle_overlap:
            phase_call_args.append('overlap=' + phase_args.beagle_overlap)

        # Assign the HMM error, if specified
        if phase_args.beagle_error:
            phase_call_args.append('err=' + phase_args.beagle_error)

        # Assign the step length, if specified
        if phase_args.beagle_step:
            phase_call_args.append('step=' + phase_args.beagle_step)

        # Assign the number of steps, if specified
        if phase_args.beagle_nsteps:
            phase_call_args.append('nsteps=' + phase_args.beagle_nsteps)

        # Assign the genetic map, if specified
        if phase_args.genetic_map:
            phase_call_args.append('map=' + phase_args.genetic_map)

        # Assign the effective pop size, if specified
        if phase_args.Ne:
            phase_call_args.append('ne=' + phase_args.Ne)

        # Assign the random seed, if specified
        if phase_args.random_seed:
            phase_call_args.append('seed=' + phase_args.random_seed)

        # Assign the chromosome to phase, if specified
        if phase_args.phase_chr:

            # Store the chromosome argument
            chr_arg = 'chrom=%s' % phase_args.phase_chr

            # Check if either bp position arguments were specified
            if phase_args.phase_from_bp or phase_args.phase_to_bp:

                # List of the position arguments, in their required order
                position_args = [':', phase_args.phase_from_bp, '-', phase_args.phase_to_bp]

                # Filter the position arguments to remove empty values
                filttered_position_args = filter(None, position_args)

                # Map the arguments to str and add them to the chromosome argument
                chr_arg += ''.join(map(str, filttered_position_args))

            # Add the chromosome argument the argument list
            phase_call_args.append(chr_arg)

        # Call beagle wrapper
        call_beagle(phase_args.beagle_path, list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format)

        # Assign expected phased output filename
        phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)

        # Rename output to phase_args.out, if specified
        if phase_args.out:
            shutil.move(phased_output, phase_args.out)
            shutil.move(phase_args.out_prefix + '.log', phase_args.out + '.log')
        # Rename log using phased_output
        else:
            shutil.move(phase_args.out_prefix + '.log', phased_output + '.log')

        logging.info('beagle log file created')

    # Assign general arguments and call shapeit
    elif phase_args.phase_algorithm == 'shapeit':

        # Assign the shapeit burn in iter, if specified
        if phase_args.shapeit_burn_iter:
            phase_call_args.extend(['--burn', phase_args.shapeit_burn_iter])

        # Assign the shapeit prune iter, if specified
        if phase_args.shapeit_prune_iter:
            phase_call_args.extend(['--prune', phase_args.shapeit_prune_iter])

        # Assign the shapeit main iter, if specified
        if phase_args.shapeit_main_iter:
            phase_call_args.extend(['--main', phase_args.shapeit_main_iter])

        # Assign the number of shapeit states if specified
        if phase_args.shapeit_states:
            phase_call_args.extend(['--states', phase_args.shapeit_states])

        # Assign the shapeit window size, if specified
        if phase_args.shapeit_window:
            phase_call_args.extend(['--window', phase_args.shapeit_window])

        # Assign the genetic map, if specified
        if phase_args.genetic_map:
            phase_call_args.extend(['--input-map', phase_args.genetic_map])

        # Assign the effective pop size, if specified
        if phase_args.Ne:
            phase_call_args.extend(['--effective-size', phase_args.Ne])

        # Assign the random seed, if specified
        if phase_args.random_seed:
            phase_call_args.extend(['--seed', phase_args.random_seed])

        # Check if only a single shapeit run is required
        if phase_args.phase_chr or len(chrs_in_vcf) == 1:

            # Holds shapeit input filename, as a temp file may be required
            shapeit_input_vcf = None

            # Check if a single chromosome is selected
            if phase_args.phase_chr and len(chrs_in_vcf) > 1:
                # Assign the chromosome-specific input prefix
                chr_input_prefix = phase_args.vcf + '.' + phase_args.phase_chr

                # Assign the chr-specific file as the shapeit input filename
                shapeit_input_vcf = chr_input_prefix + '.vcf.gz'

                # Create the chromosome-specific input
                chr_subset_file(phase_args.vcf,
                                phase_args.phase_chr,
                                chr_input_prefix,
                                'vcf.gz',
                                from_bp = phase_args.phase_from_bp,
                                to_bp = phase_args.phase_to_bp)

                logging.info('Chr %s subset created' % phase_args.phase_chr)

            else:
                # Assign the shapeit input filename
                shapeit_input_vcf = phase_args.vcf

            # Assigns the input and output arguments for shapeit
            phase_call_args.extend(['--input-vcf', shapeit_input_vcf,
                                    '--output-max', phase_args.out_prefix,
                                    '--output-log', phase_args.out_prefix + '.phase.log'])

            # Assign the from bp position argument, if specified
            if phase_args.phase_from_bp:
                phase_call_args.extend(['--input-from', phase_args.phase_from_bp])

            # Assign the to bp position argument, if specified
            if phase_args.phase_to_bp:
                phase_call_args.extend(['--input-to', phase_args.phase_to_bp])

            # Call shapeit wrapper
            call_shapeit(list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format)

            # Assign expected phased output filename
            phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)

            # Combine the log files
            concatenate_logs([phase_args.out_prefix + '.phase.log', phase_args.out_prefix + '.log'],  phased_output + '.log')

            # Rename output to phase_args.out, if specified
            if phase_args.out:
                shutil.move(phased_output, phase_args.out)
                shutil.move(phased_output + '.log',  phase_args.out + '.log')

            logging.info('shapeit log file created')

            # Check if a chr subset file was created
            if phase_args.phase_chr and len(chrs_in_vcf) > 1:
                # Delete the chromosome-specific input
                os.remove(shapeit_input_vcf)

                logging.info('Chr %s subset deleted' % phase_args.phase_chr)

            # Remove intermediate files created by shapeit
            remove_intermediate_files(phase_args.out_prefix)

        # Check if multiple shapeit runs are required
        else:

            # List to store the phased filenames
            phased_filename_list = []

            # List to store the phased logs
            phased_log_list = []

            logging.info('Multi-chr shapeit phasing assigned')

            for chr_in_vcf in chrs_in_vcf:

                logging.info('Chr %s assigned' % chr_in_vcf)

                # Copy the arguments for this run
                chr_call_args = copy.deepcopy(phase_call_args)

                # Assign the chromosome-specific output prefix
                chr_out_prefix = phase_args.out_prefix + '.' + chr_in_vcf

                # Assign the expected chromosome-specific output filename
                chr_out_filename = '%s.%s' % (chr_out_prefix, phase_args.out_format)

                # Store the output filename
                phased_filename_list.append(chr_out_filename)

                # Assign the chromosome-specific input prefix
                chr_input_prefix = phase_args.vcf + '.' + chr_in_vcf

                # Assign the expected chromosome-specific input filename
                chr_in_filename = chr_input_prefix + '.vcf.gz'

                # Create the chromosome-specific input
                chr_subset_file(phase_args.vcf,
                                chr_in_vcf,
                                chr_input_prefix,
                                'vcf.gz')

                # Assigns the input and output arguments for shapeit
                chr_call_args.extend(['--input-vcf', chr_in_filename,
                                      '--output-max', chr_out_prefix,
                                      '--output-log', chr_out_prefix + '.phase.log'])

                # Call shapeit wrapper
                call_shapeit(list(map(str, chr_call_args)), chr_out_prefix, phase_args.out_format)

                # Combine log files
                concatenate_logs([chr_out_prefix + '.phase.log', chr_out_prefix + '.log'],  chr_out_filename + '.log')

                # Store the filename of the combined logs
                phased_log_list.append(chr_out_filename + '.log')

                # Delete the chromosome-specific input
                os.remove(chr_in_filename)

                # Remove intermediate files created by shapeit
                remove_intermediate_files(chr_out_prefix)

            # Concatenate the vcf files
            concatenate(phased_filename_list, phase_args.out_prefix, phase_args.out_format)

            logging.info('Concatenated chromosomes')

            # Assign expected concatenated output filename
            phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)

            # Combine the log files
            concatenate_logs(phased_log_list, phased_output + '.log')

            # Rename output to phase_args.out, if specified
            if phase_args.out:
                shutil.move(phased_output, phase_args.out)
                shutil.move(phased_output + '.log',  phase_args.out + '.log')

            logging.info('Multi-chr shapeit log created')

    # Reverts the VCF input file
    if vcfname_renamed:
        os.rename(phase_args.vcf, phase_args.vcf[:-len(vcfname_ext)])

if __name__ == "__main__":
    initLogger()
    run()