diff vcf_phase.py @ 0:3830d29fca6a draft

Uploaded
author jaredgk
date Mon, 15 Oct 2018 18:15:47 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_phase.py	Mon Oct 15 18:15:47 2018 -0400
@@ -0,0 +1,542 @@
+import os
+import sys
+import copy
+import shutil
+import argparse
+import glob
+import logging
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))
+
+from vcf_reader_func import checkFormat
+from logging_module import initLogger, logArgs
+from model import read_model_file
+from beagle import call_beagle
+from shapeit import call_shapeit, remove_intermediate_files
+from bcftools import pipe_bcftools_to_chr, chr_subset_file, concatenate
+
+def phase_argument_parser(passed_arguments):
+    '''Phase Argument Parser - Assigns arguments for vcftools from command line.
+    Depending on the argument in question, a default value may be specified'''
+
+    def parser_confirm_file ():
+        '''Custom action to confirm file exists'''
+        class customAction(argparse.Action):
+            def __call__(self, parser, args, value, option_string=None):
+                if not os.path.isfile(value):
+                    raise IOError('%s not found' % value)
+                setattr(args, self.dest, value)
+        return customAction
+
+    def metavar_list (var_list):
+        '''Create a formmated metavar list for the help output'''
+        return '{' + ', '.join(var_list) + '}'
+
+    phase_parser = argparse.ArgumentParser()
+
+    # Input arguments
+    phase_parser.add_argument('--vcf', help = "Input VCF filename", type = str, required = True, action = parser_confirm_file())
+
+    # Model file arguments
+    phase_parser.add_argument('--model-file', help = 'Defines the model file', type = str, action = parser_confirm_file())
+    phase_parser.add_argument('--model', help = 'Defines the model to analyze', type = str)
+
+    # General arguments
+    phase_parser.add_argument('--overwrite', help = "Overwrite previous output files", action = 'store_true')
+    phase_parser.add_argument('--beagle-path', help = "Defines path to locate beagle.jar", type = str, default = 'bin/')
+
+    # Phase algorithm argument
+    phasing_list = ['beagle', 'shapeit']
+    phasing_default = 'beagle'
+    phase_parser.add_argument('--phase-algorithm', metavar = metavar_list(phasing_list), help = 'Specifies the phase algorithm to be used', type = str, choices = phasing_list, default = phasing_default)
+
+    # Common phasing arguments
+    phase_parser.add_argument('--Ne', help = 'Defines the effective population size', type = int)
+    phase_parser.add_argument('--random-seed', help="Defines the random seed value for the random number generator", type = int)
+    phase_parser.add_argument('--genetic-map', help = 'Genetic map filename', type = str, action = parser_confirm_file())
+    phase_parser.add_argument('--phase-chr', help = 'Selects a single chromosome to phase', type = str)
+    phase_parser.add_argument('--phase-from-bp', help = 'Lower bound of sites to include (Only usable with a single chromosome)', type = int)
+    phase_parser.add_argument('--phase-to-bp', help = 'Upper bound of sites to include (Only usable with a single chromosome)', type = int)
+
+    # Shapeit-specific options
+    phase_parser.add_argument('--shapeit-burn-iter', help = 'Number of the burn-in iterations (shapeit)', type = int)
+    phase_parser.add_argument('--shapeit-prune-iter', help = 'Number of pruning iterations (shapeit)', type = int)
+    phase_parser.add_argument('--shapeit-main-iter', help = 'Number of main iterations (shapeit)', type = int)
+    phase_parser.add_argument('--shapeit-states', help = 'Number of conditioning states for haplotype estimation (shapeit)', type = int)
+    phase_parser.add_argument('--shapeit-window', help = 'Model window size in Mb (shapeit)', type = float)
+
+    # Beagle-specific options
+    phase_parser.add_argument('--beagle-burn-iter', help = 'Number of the burn-in iterations (beagle)', type = int)
+    phase_parser.add_argument('--beagle-iter', help = 'Number of iterations after burn-in (beagle)', type = int)
+    phase_parser.add_argument('--beagle-states', help = 'Number of model states for genotype estimation (beagle)', type = int)
+    phase_parser.add_argument('--beagle-window', help = 'Sliding window size in cM (beagle)', type = float)
+    phase_parser.add_argument('--beagle-overlap', help = 'Overlap between neighboring sliding windows in cM (beagle)', type = float)
+    phase_parser.add_argument('--beagle-error', help = 'HMM allele mismatch probability (beagle)', type = float)
+    phase_parser.add_argument('--beagle-step', help = 'Step length in cM used for identifying short IBS segments (beagle)', type = float)
+    phase_parser.add_argument('--beagle-nsteps', help = 'Number of consecutive --beagle-steps used for identifying long IBS segments (beagle)', type = int)
+
+    # Output arguments
+    phase_parser.add_argument('--out', help = 'Defines the output filename')
+    phase_parser.add_argument('--out-prefix', help = 'Defines the output prefix (used by phasing algorithms)', default = 'out')
+
+    out_format_list = ['vcf', 'vcf.gz', 'bcf']
+    out_format_default = 'vcf.gz'
+
+    phase_parser.add_argument('--out-format', metavar = metavar_list(out_format_list), help = 'Specifies the output format.', type = str, choices = out_format_list, default = out_format_default)
+
+    if passed_arguments:
+        return phase_parser.parse_args(passed_arguments)
+    else:
+        return phase_parser.parse_args()
+
+def concatenate_logs (log_files, new_log_filename):
+    '''
+        Concatenates log files
+
+        This function is used to concatenate log files. If a single chromosome
+        is being analyzed, the function is used to concatenate the log files
+        from multiple wrappers. If multiple chromosomes are being analyzed the
+        function is used to concatenate the logs of each chromosome.
+
+        Raises
+        ------
+        IOError
+            If a log file does not exist
+    '''
+
+    # Open the new log file
+    with open(new_log_filename, 'wb') as out_file:
+        # Loop the logs to be combined
+        for log_file in log_files:
+            # Confirm the log file exists, raise an error if not
+            if not os.path.isfile(log_file):
+                raise IOError('%s not found' % log_file)
+            # Open the log file
+            with open(log_file, 'rb') as in_file:
+                # Copy the contents of the file
+                shutil.copyfileobj(in_file, out_file)
+
+    # Remove old log files
+    for log_file in log_files:
+        os.remove(log_file)
+
+def assign_vcf_extension (filename):
+    # Checks if the file is unzipped, bgzipped, or gzipped
+    vcfname_format = checkFormat(filename)
+
+    if vcfname_format == 'vcf':
+        return '.vcf'
+
+    elif vcfname_format == 'gzip' or vcfname_format == 'bgzip':
+        return '.vcf.gz'
+
+    elif vcfname_format == 'bcf':
+        raise Exception('BCF not supported in current version')
+
+    else:
+        raise Exception('Unknown file format')
+
+def run (passed_arguments = []):
+    '''
+        Phaser for VCF files.
+
+        Automates the phasing process for a specified VCF file. The function
+        allows users to select between multiple phasing algorithms: beagle
+        (default) and shapit.
+
+        Parameters
+        ----------
+        --vcf : str
+            Specifies the input VCF filename
+        --phase-algorithm : str
+            Specifies the algorithm to be used. Choices: beagle (default) and
+            shapit
+        --out : str
+            Specifies the output filename
+
+        Returns
+        -------
+        output : file
+            Phased VCF file
+
+        Raises
+        ------
+        IOError
+            Input VCF file does not exist
+        IOError
+            Output file already exists
+
+    '''
+
+    # Grab VCF arguments from command line
+    phase_args = phase_argument_parser(passed_arguments)
+
+    # Adds the arguments (i.e. parameters) to the log file
+    logArgs(phase_args, func_name = 'vcf_phase')
+
+    # Assign file extension for VCF input file
+    vcfname_ext = assign_vcf_extension(phase_args.vcf)
+
+    logging.info('Input file assigned')
+
+    # Used to confirm if the VCF input was renamed
+    vcfname_renamed = False
+
+    # Confirm input has correct file extension
+    if vcfname_ext not in phase_args.vcf:
+        vcfname_renamed = True
+        os.rename(phase_args.vcf, phase_args.vcf + vcfname_ext)
+        phase_args.vcf += vcfname_ext
+
+    # List to hold beagle/shapeit arguments
+    phase_call_args = []
+
+    # bool to check if an eclude file was created
+    exclude_file_created = False
+
+    # bool to check if an include file was created
+    include_file_created = False
+
+    # Performs actions related to --model-file and argument assignment
+    if phase_args.model_file:
+
+        # Check if a model has been specified
+        if not phase_args.model:
+            raise IOError('No selected model. Please use --model to select a model')
+
+        # Read in the model file
+        models_file = read_model_file(phase_args.model_file)
+
+        # Check that the selected model was found in the file
+        if phase_args.model not in models_file:
+            raise IOError('Selected model "%s" not found in: %s' % (phase_args.model, phase_args.model_file))
+
+        logging.info('Model assigned')
+
+        if phase_args.phase_algorithm == 'beagle':
+
+            # Get list of individuals to include
+            inds_in_model = models_file[phase_args.model].inds
+
+            # Create file with individuals to exclude (beagle has no include option)
+            models_file.create_exclude_ind_file(inds_to_include = inds_in_model, overwrite = phase_args.overwrite)
+
+            # Assign exclude file
+            phase_call_args.append('excludesamples=' + models_file.exclude_file)
+
+            # Confirm that the exclude file was created
+            exclude_file_created = True
+
+        elif phase_args.phase_algorithm == 'shapeit':
+
+            # Assign the model
+            selected_model = models_file[phase_args.model]
+
+            # Create file with individuals to exclude (beagle has no include option)
+            selected_model.create_ind_file(overwrite = phase_args.overwrite)
+
+            # Assign exclude file
+            phase_call_args.extend(['--include-ind', selected_model.ind_file])
+
+            # Confirm that the include file was created
+            include_file_created = True
+
+    # Get the list of chromosomes within the VCF
+    chrs_in_vcf = pipe_bcftools_to_chr(phase_args.vcf)
+
+    # Check if the user specified a specific chromosome
+    if phase_args.phase_chr:
+        # Check if the chromosome is within the file
+        if phase_args.phase_chr not in chrs_in_vcf:
+            raise Exception('--phase-chr %s not found in %s' % (phase_args.phase_chr, phase_args.vcf))
+
+    # Check that a chr was specified if a bp-based argument was specified
+    if (phase_args.phase_from_bp or phase_args.phase_to_bp) and not phase_args.phase_chr:
+        # Check if the file has a single chromosome
+        if len(chrs_in_vcf) == 1:
+            # Check if the beagle algorithm is being used
+            if phase_args.phase_algorithm == 'beagle':
+                # Assign the chromosome, as beagle requires it to be assigned
+                phase_args.phase_chr = chrs_in_vcf[0]
+
+                logging.info('Assigned chr %s for beagle' % chrs_in_vcf[0])
+        # Report error if multiple chromosomes are within the file
+        else:
+            raise Exception('The --phase-from-bp and --phase-to-bp arguments '
+                            'require the --phase-chr argument to function if '
+                            'multiple chrs are within %s' % phase_args.vcf)
+
+    # Assign general arguments and call beagle
+    if phase_args.phase_algorithm == 'beagle':
+
+        # Assigns the input and output arguments
+        phase_call_args.extend(['gt=' + phase_args.vcf,
+                                'out=' + phase_args.out_prefix])
+
+        # Assign the burn-in iter, if specified
+        if phase_args.beagle_burn_iter:
+            phase_call_args.append('burnin=' + phase_args.beagle_burn_iter)
+
+        # Assign the iter, if specified
+        if phase_args.beagle_iter:
+            phase_call_args.append('iterations=' + phase_args.beagle_iter)
+
+        # Assign the state count, if specified
+        if phase_args.beagle_states:
+            phase_call_args.append('phase-states=' + phase_args.beagle_states)
+
+        # Assign the window length, if specified
+        if phase_args.beagle_window:
+            phase_call_args.append('window=' + phase_args.beagle_window)
+
+        # Assign the window overlap, if specified
+        if phase_args.beagle_overlap:
+            phase_call_args.append('overlap=' + phase_args.beagle_overlap)
+
+        # Assign the HMM error, if specified
+        if phase_args.beagle_error:
+            phase_call_args.append('err=' + phase_args.beagle_error)
+
+        # Assign the step length, if specified
+        if phase_args.beagle_step:
+            phase_call_args.append('step=' + phase_args.beagle_step)
+
+        # Assign the number of steps, if specified
+        if phase_args.beagle_nsteps:
+            phase_call_args.append('nsteps=' + phase_args.beagle_nsteps)
+
+        # Assign the genetic map, if specified
+        if phase_args.genetic_map:
+            phase_call_args.append('map=' + phase_args.genetic_map)
+
+        # Assign the effective pop size, if specified
+        if phase_args.Ne:
+            phase_call_args.append('ne=' + phase_args.Ne)
+
+        # Assign the random seed, if specified
+        if phase_args.random_seed:
+            phase_call_args.append('seed=' + phase_args.random_seed)
+
+        # Assign the chromosome to phase, if specified
+        if phase_args.phase_chr:
+
+            # Store the chromosome argument
+            chr_arg = 'chrom=%s' % phase_args.phase_chr
+
+            # Check if either bp position arguments were specified
+            if phase_args.phase_from_bp or phase_args.phase_to_bp:
+
+                # List of the position arguments, in their required order
+                position_args = [':', phase_args.phase_from_bp, '-', phase_args.phase_to_bp]
+
+                # Filter the position arguments to remove empty values
+                filttered_position_args = filter(None, position_args)
+
+                # Map the arguments to str and add them to the chromosome argument
+                chr_arg += ''.join(map(str, filttered_position_args))
+
+            # Add the chromosome argument the argument list
+            phase_call_args.append(chr_arg)
+
+        # Call beagle wrapper
+        call_beagle(phase_args.beagle_path, list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format)
+
+        # Assign expected phased output filename
+        phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)
+
+        # Rename output to phase_args.out, if specified
+        if phase_args.out:
+            shutil.move(phased_output, phase_args.out)
+            shutil.move(phase_args.out_prefix + '.log', phase_args.out + '.log')
+        # Rename log using phased_output
+        else:
+            shutil.move(phase_args.out_prefix + '.log', phased_output + '.log')
+
+        logging.info('beagle log file created')
+
+    # Assign general arguments and call shapeit
+    elif phase_args.phase_algorithm == 'shapeit':
+
+        # Assign the shapeit burn in iter, if specified
+        if phase_args.shapeit_burn_iter:
+            phase_call_args.extend(['--burn', phase_args.shapeit_burn_iter])
+
+        # Assign the shapeit prune iter, if specified
+        if phase_args.shapeit_prune_iter:
+            phase_call_args.extend(['--prune', phase_args.shapeit_prune_iter])
+
+        # Assign the shapeit main iter, if specified
+        if phase_args.shapeit_main_iter:
+            phase_call_args.extend(['--main', phase_args.shapeit_main_iter])
+
+        # Assign the number of shapeit states if specified
+        if phase_args.shapeit_states:
+            phase_call_args.extend(['--states', phase_args.shapeit_states])
+
+        # Assign the shapeit window size, if specified
+        if phase_args.shapeit_window:
+            phase_call_args.extend(['--window', phase_args.shapeit_window])
+
+        # Assign the genetic map, if specified
+        if phase_args.genetic_map:
+            phase_call_args.extend(['--input-map', phase_args.genetic_map])
+
+        # Assign the effective pop size, if specified
+        if phase_args.Ne:
+            phase_call_args.extend(['--effective-size', phase_args.Ne])
+
+        # Assign the random seed, if specified
+        if phase_args.random_seed:
+            phase_call_args.extend(['--seed', phase_args.random_seed])
+
+        # Check if only a single shapeit run is required
+        if phase_args.phase_chr or len(chrs_in_vcf) == 1:
+
+            # Holds shapeit input filename, as a temp file may be required
+            shapeit_input_vcf = None
+
+            # Check if a single chromosome is selected
+            if phase_args.phase_chr and len(chrs_in_vcf) > 1:
+                # Assign the chromosome-specific input prefix
+                chr_input_prefix = phase_args.vcf + '.' + phase_args.phase_chr
+
+                # Assign the chr-specific file as the shapeit input filename
+                shapeit_input_vcf = chr_input_prefix + '.vcf.gz'
+
+                # Create the chromosome-specific input
+                chr_subset_file(phase_args.vcf,
+                                phase_args.phase_chr,
+                                chr_input_prefix,
+                                'vcf.gz',
+                                from_bp = phase_args.phase_from_bp,
+                                to_bp = phase_args.phase_to_bp)
+
+                logging.info('Chr %s subset created' % phase_args.phase_chr)
+
+            else:
+                # Assign the shapeit input filename
+                shapeit_input_vcf = phase_args.vcf
+
+            # Assigns the input and output arguments for shapeit
+            phase_call_args.extend(['--input-vcf', shapeit_input_vcf,
+                                    '--output-max', phase_args.out_prefix,
+                                    '--output-log', phase_args.out_prefix + '.phase.log'])
+
+            # Assign the from bp position argument, if specified
+            if phase_args.phase_from_bp:
+                phase_call_args.extend(['--input-from', phase_args.phase_from_bp])
+
+            # Assign the to bp position argument, if specified
+            if phase_args.phase_to_bp:
+                phase_call_args.extend(['--input-to', phase_args.phase_to_bp])
+
+            # Call shapeit wrapper
+            call_shapeit(list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format)
+
+            # Assign expected phased output filename
+            phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)
+
+            # Combine the log files
+            concatenate_logs([phase_args.out_prefix + '.phase.log', phase_args.out_prefix + '.log'],  phased_output + '.log')
+
+            # Rename output to phase_args.out, if specified
+            if phase_args.out:
+                shutil.move(phased_output, phase_args.out)
+                shutil.move(phased_output + '.log',  phase_args.out + '.log')
+
+            logging.info('shapeit log file created')
+
+            # Check if a chr subset file was created
+            if phase_args.phase_chr and len(chrs_in_vcf) > 1:
+                # Delete the chromosome-specific input
+                os.remove(shapeit_input_vcf)
+
+                logging.info('Chr %s subset deleted' % phase_args.phase_chr)
+
+            # Remove intermediate files created by shapeit
+            remove_intermediate_files(phase_args.out_prefix)
+
+        # Check if multiple shapeit runs are required
+        else:
+
+            # List to store the phased filenames
+            phased_filename_list = []
+
+            # List to store the phased logs
+            phased_log_list = []
+
+            logging.info('Multi-chr shapeit phasing assigned')
+
+            for chr_in_vcf in chrs_in_vcf:
+
+                logging.info('Chr %s assigned' % chr_in_vcf)
+
+                # Copy the arguments for this run
+                chr_call_args = copy.deepcopy(phase_call_args)
+
+                # Assign the chromosome-specific output prefix
+                chr_out_prefix = phase_args.out_prefix + '.' + chr_in_vcf
+
+                # Assign the expected chromosome-specific output filename
+                chr_out_filename = '%s.%s' % (chr_out_prefix, phase_args.out_format)
+
+                # Store the output filename
+                phased_filename_list.append(chr_out_filename)
+
+                # Assign the chromosome-specific input prefix
+                chr_input_prefix = phase_args.vcf + '.' + chr_in_vcf
+
+                # Assign the expected chromosome-specific input filename
+                chr_in_filename = chr_input_prefix + '.vcf.gz'
+
+                # Create the chromosome-specific input
+                chr_subset_file(phase_args.vcf,
+                                chr_in_vcf,
+                                chr_input_prefix,
+                                'vcf.gz')
+
+                # Assigns the input and output arguments for shapeit
+                chr_call_args.extend(['--input-vcf', chr_in_filename,
+                                      '--output-max', chr_out_prefix,
+                                      '--output-log', chr_out_prefix + '.phase.log'])
+
+                # Call shapeit wrapper
+                call_shapeit(list(map(str, chr_call_args)), chr_out_prefix, phase_args.out_format)
+
+                # Combine log files
+                concatenate_logs([chr_out_prefix + '.phase.log', chr_out_prefix + '.log'],  chr_out_filename + '.log')
+
+                # Store the filename of the combined logs
+                phased_log_list.append(chr_out_filename + '.log')
+
+                # Delete the chromosome-specific input
+                os.remove(chr_in_filename)
+
+                # Remove intermediate files created by shapeit
+                remove_intermediate_files(chr_out_prefix)
+
+            # Concatenate the vcf files
+            concatenate(phased_filename_list, phase_args.out_prefix, phase_args.out_format)
+
+            logging.info('Concatenated chromosomes')
+
+            # Assign expected concatenated output filename
+            phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)
+
+            # Combine the log files
+            concatenate_logs(phased_log_list, phased_output + '.log')
+
+            # Rename output to phase_args.out, if specified
+            if phase_args.out:
+                shutil.move(phased_output, phase_args.out)
+                shutil.move(phased_output + '.log',  phase_args.out + '.log')
+
+            logging.info('Multi-chr shapeit log created')
+
+    # Reverts the VCF input file
+    if vcfname_renamed:
+        os.rename(phase_args.vcf, phase_args.vcf[:-len(vcfname_ext)])
+
+if __name__ == "__main__":
+    initLogger()
+    run()