Mercurial > repos > jaredgk > ppp_vcfphase
changeset 0:3830d29fca6a draft
Uploaded
author | jaredgk |
---|---|
date | Mon, 15 Oct 2018 18:15:47 -0400 |
parents | |
children | 15245deda141 |
files | bcftools.py beagle.py logging_module.py model.py shapeit.py vcf_phase.py vcf_phase.xml vcf_reader_func.py |
diffstat | 8 files changed, 1969 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bcftools.py Mon Oct 15 18:15:47 2018 -0400 @@ -0,0 +1,131 @@ +import os +import sys +import logging +import subprocess + +sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared'))) + +from vcf_reader_func import checkFormat + +def check_bcftools_for_errors (bcftools_stderr): + ''' + Checks the bgzip stderr for errors + + Parameters + ---------- + bcftools_stderr : str + bcftools stderr + + Raises + ------ + IOError + If bcftools stderr returns an error + ''' + + # Expand as errors are discovered + if bcftools_stderr: + logging.error(vcftools_stderr) + raise Exception(vcftools_stderr) + +def call_bcftools (bcftools_call_args): + + # bcftools subprocess call + bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # Wait for bcftools to finish + bcftools_out, bcftools_err = bcftools_call.communicate() + + check_bcftools_for_errors(bcftools_err) + + logging.info('bcftools call complete') + +def check_for_index (filename): + + # Assign the file format + file_format = checkFormat(filename) + + # Check if the file to be indexed is a vcf.gz + if file_format == 'bgzip': + # Check if the index (.tbi) exists + if os.path.isfile(filename + '.tbi'): + return True + + # Check if the file to be indexed is a bcf + elif file_format == 'bcf': + # Check if the index (.csi) exists + if os.path.isfile(filename + '.csi'): + return True + + # Return false if no index is found + return False + +def create_index (filename): + + # Assign the file format + file_format = checkFormat(filename) + + # Check if the file to be indexed is a vcf.gz + if file_format == 'bgzip': + # Create a index (.tbi) + call_bcftools(['index', '-t', filename]) + + # Check if the file to be indexed is a bcf + elif file_format == 'bcf': + # Create a index (.csi) + call_bcftools(['index', '-c', filename]) + + # Report if file cannot be indexed + else: + raise Exception('Error creating index for: %s. Only .bcf and .vcf.gz (bgzip) files are supported.' % filename) + +def convert_to_bcf (filename, output_prefix): + + # Holds the arguments to convert to BCF format + convert_args = ['convert', '-O', 'b'] + + # Stores the specified output_prefix to the BCF file + bcf_output = '%s.bcf' % output_prefix + + # Assigns the output file to the arguments + convert_args.extend(['-o', bcf_output]) + + # Assigns the specified input to the arguments + convert_args.append(filename) + + # Call bcftools + call_bcftools(convert_args) + + +def convert_to_vcf (filename, output_prefix): + + # Holds the arguments to convert to VCF format + convert_args = ['view', '-O', 'v'] + + # Stores the specified output_prefix to the VCF file + vcf_output = '%s.vcf' % output_prefix + + # Assigns the output file to the arguments + convert_args.extend(['-o', vcf_output]) + + # Assigns the specified input to the arguments + convert_args.append(filename) + + # Call bcftools + call_bcftools(convert_args) + +def convert_to_vcfgz (filename, output_prefix): + + # Holds the arguments to convert to VCFGZ format + convert_args = ['view', '-O', 'z'] + + # Stores the specified output_prefix to the VCFGZ file + vcfgz_output = '%s.vcf.gz' % output_prefix + + # Assigns the output file to the arguments + convert_args.extend(['-o', vcfgz_output]) + + # Assigns the specified input to the arguments + convert_args.append(filename) + + # Call bcftools + call_bcftools(convert_args)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/beagle.py Mon Oct 15 18:15:47 2018 -0400 @@ -0,0 +1,163 @@ +import os +import sys +import subprocess +import shutil +import argparse +import glob +import logging + +sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared'))) + +from vcf_reader_func import checkFormat +from logging_module import initLogger, logArgs +from vcftools import bgzip_decompress_vcfgz +from bcftools import convert_to_bcf, check_for_index, create_index + +def delete_beagle_log (output_prefix): + ''' + Delete beagle log file + + This function is used to delete beagle's log file if an error is + encountered. A warning is produced if the log file cannot be found. + + Parameters + ---------- + output_prefix : str + Output file prefix + ''' + + # Check that log file exists, if not return warning + if not os.path.isfile(output_prefix + '.log'): + logging.warning('beagle log file %s.log does not exist' % output_prefix) + else: + os.remove(output_prefix + '.log') + +def check_beagle_for_errors (beagle_stderr, output_prefix): + ''' + Checks the beagle stdout for errors + + Parameters + ---------- + beagle_stderr : str + beagle stderr + output_prefix : str + Output file prefix + + Raises + ------ + Exception + If beagle stdout returns an error + ''' + + # Check if beagle completed without an error + if not beagle_stderr.strip(): + pass + + # Print missing data message if that is likely + elif 'ERROR: genotype is missing allele separator:' in str(beagle_stderr): + # Delete the beagle log file + delete_beagle_log(output_prefix) + + # Store reported error + error_reported = 'ERROR: genotype is missing allele separator' + # Store message for user about error + user_message = 'Please confirm the input has no missing data.' + # Report on the error + raise Exception(error_reported + '\n' + user_message) + + # Print output for beagle if error is detected + elif 'ERROR:' in str(beagle_stderr): + # Delete the beagle log file + delete_beagle_log(output_prefix) + + # Splits log into list of lines + beagle_stderr_lines = beagle_stderr.splitlines() + # Prints the error(s) + raise Exception('\n'.join((output_line for output_line in beagle_stderr_lines if output_line.startswith('ERROR:')))) + + # Print output if not completed and no error found. Unlikely to be used, but included. + else: + # Delete the beagle log file + delete_beagle_log(output_prefix) + + raise Exception(beagle_stderr) + + +def standard_beagle_call (beagle_path, beagle_call_args, output_prefix): + ''' + Calls beagle using subprocess + + This function is used to call beagle under standard conditions. The + functions then passes the stderr to check_beagle_for_errors to check + for errors. + + Parameters + ---------- + beagle_path : str + Path to beagle.jar + beagle_call_args : list + Argument list for beagle + output_prefix : str + Output file prefix + ''' + + # Assign location of beagle jar file + beagle_jar = os.path.join(beagle_path, 'beagle.jar') + + # Check that beagle.jar exists + if not os.path.isfile(beagle_jar): + raise IOError('beagle.jar not found. Path specified: %s' % beagle_path) + + logging.info('beagle phasing parameters assigned') + + # Phasing subprocess call + phase_call = subprocess.Popen(['java', '-jar', beagle_jar] + beagle_call_args, stdout = subprocess.PIPE, stderr = subprocess.PIPE) + phase_stdout, phase_stderr = phase_call.communicate() + + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + phase_stderr = phase_stderr.decode() + + # Check beagle call for errors + check_beagle_for_errors(phase_stderr, output_prefix) + + logging.info('beagle phasing complete') + +def call_beagle (beagle_path, beagle_call_args, output_prefix, output_format): + ''' + Automates beagle calls + + This function passes the argument list to standard_beagle_call. Once the + beagle call has finished, the function will automatically convert the + bgzip compressed output of beagle to BCF and VCF, if either format is + specified. + + Parameters + ---------- + beagle_path : str + Path to beagle.jar + beagle_call_args : list + Argument list for beagle + output_prefix : str + Output file prefix + output_format : str + Output file format + ''' + print beagle_call_args + # Standard call to beagle + standard_beagle_call(beagle_path, beagle_call_args, output_prefix) + + # Decompress if a VCF files is requested + if output_format == 'vcf': + bgzip_decompress_vcfgz(output_prefix + '.vcf.gz') + + # Convert to BCF if requested + elif output_format == 'bcf': + + # Check if there is an index file + if check_for_index(output_prefix + '.vcf.gz') == False: + # Create an index if not found + create_index(output_prefix + '.vcf.gz') + # Convert vcf.gz to bcf + convert_to_bcf(output_prefix + '.vcf.gz', output_prefix)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/logging_module.py Mon Oct 15 18:15:47 2018 -0400 @@ -0,0 +1,97 @@ +import sys +import logging + + + +def initLogger(filename='pipeline.log', filelevel='INFO', + streamlevel='WARNING', resetlog=True): + """General logger initialization for PPP functions. + + Messages from WARNING level and higher will be logged to file and to + stderr. By default, INFO level will also be written to logfile. Both + levels are flexible. + + Level groupings: + ERROR: Error messages should be generated by an exception call + WARNING: Non-terminal behavior that may be unusual (i.e. lists + with no values, blank strings) + INFO: Every major (user-facing) function should have the following: + -Message for function start + -List of input arguments and options + -Basic sanity checks (dimensions of input data) + -Statements before or after major function calls + -Message for function end + DEBUG: Mainly for developer use/debugging. Generate logs for + sub-functions that match INFO level for major functions. + Possible care should be used if there are lots of loops called. + + Use: Call with either the individual function (in __name__=="__main__" + statement) or in pipeline file. + + Parameters + ---------- + filename : str ("pipeline.log") + Name of file that log will be written to + + filelevel : {'INFO','DEBUG','WARNING','ERROR'} + Set minimum level of log messages that are written to log file. + Note that this acts as a de facto minumum for 'streamlevel' as well. + + streamlevel : {'WARNING','DEBUG','INFO','ERROR'} + Set minimum level of log messages that are output to stream. + + resetlog : bool (True) + If true, will overwrite logfile when opening. Set to false if log is + being initialized multiple times + + Returns + ------- + None + + Exceptions + ---------- + If filelevel or streamlevel are not a valid logging level + + """ + log_levels = ['DEBUG','INFO','WARNING','ERROR'] + if filelevel is not None and filelevel.upper() not in log_levels: + raise Exception('filelevel value %s is not a valid level' % + filelevel) + if streamlevel is not None and streamlevel.upper() not in log_levels: + raise Exception('streamlevel value %s is not a valid level' % + streamlevel) + fmt_def = "%(asctime)s - %(funcName)s - %(levelname)s: %(message)s" + fmt_notime = "%(funcName)s - %(levelname)s: %(message)s" + fmtr = logging.Formatter(fmt=fmt_def) + fmtr_notime = logging.Formatter(fmt=fmt_notime) + filelogger = logging.getLogger() + filelogger.setLevel('INFO') + if streamlevel is not None: + s_handler = logging.StreamHandler() + s_handler.setFormatter(fmtr_notime) + s_handler.setLevel(streamlevel) + filelogger.addHandler(s_handler) + logmode = 'a' + if resetlog: + logmode = 'w' + if filelevel is not None: + f_handler = logging.FileHandler(filename,mode=logmode) + f_handler.setFormatter(fmtr) + f_handler.setLevel(filelevel) + #filelogger.setLevel(filelevel) + filelogger.addHandler(f_handler) + #Formats exception messages to be sent to appropriate loggers + def exp_handler(etype,val,tb): + logging.error("%s" % (val), exc_info=(etype,val,tb)) + + sys.excepthook = exp_handler + + +def logArgs(args, func_name=None, print_nones=False): + header = "Arguments" + if func_name is not None: + header+=" for"+func_name + for arg in vars(args): + val = vars(args)[arg] + if val is not None or print_nones: + logging.info('Argument %s: %s' % (arg,val))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/model.py Mon Oct 15 18:15:47 2018 -0400 @@ -0,0 +1,236 @@ +import os +import sys +import json +import subprocess +import argparse +import logging +import itertools + +from collections import defaultdict + +# Insert Jared's directory path, required for calling Jared's functions. Change when directory structure changes. +sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared'))) + +from logging_module import initLogger + +class ModelFile(dict): + def __init__(self, *arg, **kw): + super(ModelFile, self).__init__(*arg, **kw) + self.inds = [] + self.ind_file = '' + self.exclude_file = '' + + def assign_inds (self, inds = []): + # Return error if inds is empty + if not inds: + raise IOError('No individuals found in the model file.') + # Store the individuals + self.inds = [str(ind) for ind in inds] + + def create_ind_file (self, file_ext = '', file_path = '', overwrite = False): + # Assign the filename for the population file + ind_filename = 'unique_individuals' + file_ext + + # If a path is assigned, create the file at the specified location + if file_path: + ind_filename = os.path.join(file_path, ind_filename) + + # Check if previous files should be overwriten + if not overwrite: + # Check if the file already exists + if os.path.isfile(ind_filename): + raise IOError('Individuals file exists.') + + # Create the population file + ind_file = open(ind_filename, 'w') + ind_file.write('%s\n' %'\n'.join(self.inds)) + ind_file.close() + + # Save the individuals filename + self.ind_file = ind_filename + + def delete_ind_file (self): + # Check if an individuals file was created + if self.ind_file: + + # Delete the individuals file + os.remove(self.ind_file) + + # Remove the filename + self.ind_file = '' + + def create_exclude_ind_file (self, inds_to_include = [], file_ext = '', file_path = '', overwrite = False): + # Assign the filename for the population file + ind_filename = 'exclude_individuals' + file_ext + + # If a path is assigned, create the file at the specified location + if file_path: + ind_filename = os.path.join(file_path, ind_filename) + + # Check if previous files should be overwriten + if not overwrite: + # Check if the file already exists + if os.path.isfile(ind_filename): + raise IOError('Individuals file exists.') + + # Create exclude list by removing included individuals + exclude_inds = list(set(self.inds) - set(inds_to_include)) + + # Create the population file + ind_file = open(ind_filename, 'w') + ind_file.write('%s\n' %'\n'.join(exclude_inds)) + ind_file.close() + + # Save the individuals filename + self.exclude_file = ind_filename + + def delete_ind_file (self): + # Check if an individuals file was created + if self.exclude_file: + + # Delete the individuals file + os.remove(self.exclude_file) + + # Remove the filename + self.exclude_file = '' + +class Model: + def __init__ (self, name): + self.name = name + self.tree = '' + self.npop = 0 + self.pop_list = [] + self.nind = defaultdict(int) + self.ind_dict = defaultdict(list) + self.pop_files = [] + self.ind_file = '' + + @property + def inds(self): + return list(itertools.chain.from_iterable(self.ind_dict.values())) + + def assign_tree (self, tree): + self.tree = str(tree) + + def assign_pop (self, pop, inds = []): + self.npop += 1 + self.pop_list.append(str(pop)) + if inds: + self.nind[pop] = len(inds) + self.ind_dict[pop] = [str(ind) for ind in inds] + + def create_pop_files (self, file_ext = '', file_path = '', overwrite = False): + for pop in self.pop_list: + # Assign the filename for the population file + pop_filename = pop + file_ext + + # If a path is assigned, create the file at the specified location + if file_path: + pop_filename = os.path.join(file_path, pop_filename) + + # Check if previous files should be overwriten + if not overwrite: + # Check if the file already exists + if os.path.isfile(pop_filename): + raise IOError('Population file exists.') + + # Create the population file + pop_file = open(pop_filename, 'w') + pop_file.write('%s\n' %'\n'.join(self.ind_dict[pop])) + pop_file.close() + + # Save the population filename + self.pop_files.append(pop_filename) + + def delete_pop_files (self): + # Check if pop files were created + if len(self.pop_files) != 0: + + # Loop the created pop files + for pop_file in self.pop_files: + # Delete the pop file + os.remove(pop_file) + + # Remove the filenames + self.pop_files = [] + + def create_ind_file (self, file_ext = '', file_path = '', overwrite = False): + # Assign the filename for the population file + ind_filename = 'individual.keep' + file_ext + + # If a path is assigned, create the file at the specified location + if file_path: + ind_filename = os.path.join(file_path, ind_filename) + + # Check if previous files should be overwriten + if not overwrite: + # Check if the file already exists + if os.path.isfile(ind_filename): + raise IOError('Individuals file exists.') + + # Create the population file + ind_file = open(ind_filename, 'w') + ind_file.write('%s\n' %'\n'.join(self.inds)) + ind_file.close() + + # Save the individuals filename + self.ind_file = ind_filename + + def delete_ind_file (self): + # Check if an individuals file was created + if self.ind_file: + + # Delete the individuals file + os.remove(self.ind_file) + + # Remove the filename + self.ind_file = '' + +def read_model_file (model_filename): + + # Check that the file exists + if not os.path.isfile(model_filename): + raise IOError + + # Create ModelFile object + models_to_return = ModelFile() + + # Check if using python 2 or 3 + if sys.version_info[0] == 2: + # Open the model file in python 2 + model_file = open(model_filename, 'rU') + else: + # Open the model file in python 3 + model_file = open(model_filename, 'r', newline=None) + + # Parse the model file using the json reader + models_dict = json.load(model_file) + + # List to store all unique individuals (i.e. individuals in all models) + individual_list = [] + + # Loop the parsed models + for model_dict in models_dict: + + # Create the model + model = Model(model_dict['name']) + + # Loop the populations in the model + for pop, pop_dict in model_dict['pops'].items(): + + # Assign the population ans it's individuals to the model + model.assign_pop(pop, pop_dict['inds']) + # Assign the individuals to the unique individual list + individual_list.extend(pop_dict['inds']) + + # Remove duplicates from the unique individual list + individual_list = list(set(individual_list)) + + # Save the model + models_to_return[str(model.name)] = model + + # Store the unique individuals within the ModelFile object + models_to_return.assign_inds(individual_list) + + # Return the models + return models_to_return
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/shapeit.py Mon Oct 15 18:15:47 2018 -0400 @@ -0,0 +1,160 @@ +import os +import sys +import subprocess +import shutil +import argparse +import glob +import logging + +sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared'))) + +from vcf_reader_func import checkFormat +from logging_module import initLogger, logArgs +from plink import convert_haps_to_vcf +#from vcftools import bgzip_decompress_vcfgz +#from bcftools import convert_to_bcf, check_for_index, create_index + +def check_shapeit_for_errors (shapeit_stdout, output_prefix): + ''' + Checks the shapeit stdout for errors + + Parameters + ---------- + shapeit_stdout : str + shapeit stdout + output_prefix : str + Output filename prefix + + Raises + ------ + Exception + If shapeit stdout returns an error + ''' + + # Returns True if the job completed without error + if 'Running time:' in str(shapeit_stdout): + pass + + # Print output if not completed and no error found. Unlikely to be used, but included. + else: + # Remove intermediate files before reporting the error + remove_intermediate_files(output_prefix, error_intermediates = True) + raise Exception(str(shapeit_stdout)) + +def remove_intermediate_files (output_prefix, error_intermediates = False): + ''' + Removes shapeit intermediate files + + This function is used to remove the various intermediate files created + by shapeit. The exact intermediate files to be removed are defined by + the error-state of shapeit. The function will also return warnings if + the intermediate files were not found. + + Parameters + ---------- + output_prefix : str + Output filename prefix + error_intermediates : bool, optional + Defines if shapeit encountered an error + + ''' + if error_intermediates: + + # Check that the log file was created, give a warning otherwise + if not os.path.isfile(output_prefix + '.phase.log'): + logging.warning('shapeit intermediate file %s.phase.log does not exist' % output_prefix) + else: + # Remove shapeit log file + os.remove(output_prefix + '.phase.log') + + else: + + # Check that the phase.ind.mm file was created, give a warning otherwise + if not os.path.isfile(output_prefix + '.phase.ind.mm'): + logging.warning('shapeit intermediate file %s.phase.ind.mm does not exist' % output_prefix) + else: + # Remove shapeit phase.ind.mm file + os.remove(output_prefix + '.phase.ind.mm') + + # Check that the phase.snp.mm file was created, give a warning otherwise + if not os.path.isfile(output_prefix + '.phase.snp.mm'): + logging.warning('shapeit intermediate file %s.phase.snp.mm does not exist' % output_prefix) + else: + # Remove shapeit phase.snp.mm file + os.remove(output_prefix + '.phase.snp.mm') + + # Check that the haps file was created, give a warning otherwise + if not os.path.isfile(output_prefix + '.haps'): + logging.warning('shapeit intermediate file %s.haps does not exist' % output_prefix) + else: + # Remove shapeit haps file + os.remove(output_prefix + '.haps') + + # Check that the sample file was created, give a warning otherwise + if not os.path.isfile(output_prefix + '.sample'): + logging.warning('shapeit intermediate file %s.sample does not exist' % output_prefix) + else: + # Remove shapeit sample file + os.remove(output_prefix + '.sample') + + logging.info('shapeit-related files removed') + +def standard_shapeit_call (shapeit_call_args, output_prefix): + ''' + Calls shapeit using subprocess + + This function is used to call shapeit and passes the resulting stdout + to check_shapeit_for_errors to check for errors. The function also + passes output_prefix to check_shapeit_for_errors to delete shapeit + intermediate files if shapeit results in an error. + + Parameters + ---------- + shapeit_call_args : list + Argument list for shapeit + output_prefix : str + Output filename prefix + + ''' + + logging.info('shapeit phasing parameters assigned') + + # Phasing subprocess call + phase_call = subprocess.Popen(['shapeit'] + shapeit_call_args, stdout = subprocess.PIPE, stderr = subprocess.PIPE) + phase_stdout, phase_stderr = phase_call.communicate() + + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + phase_stdout = phase_stdout.decode() + + # Check shapeit call for errors + check_shapeit_for_errors(phase_stdout, output_prefix) + + logging.info('shapeit phasing complete (HAPS format)') + +def call_shapeit (shapeit_call_args, output_prefix, output_format): + ''' + Calls shapeit and automates file conversions + + The function is used to call shapeit and also automates conversion to + VCF, VCF.GZ, and BCF using plink2 + + Parameters + ---------- + shapeit_call_args : list + Argument list for shapeit + output_prefix : str + Output filename prefix + output_format : str + Output file format + + ''' + + # Standard call to beagle + standard_shapeit_call(shapeit_call_args, output_prefix) + + # Convert haps-format to vcf + convert_haps_to_vcf(output_prefix, output_format) + + logging.info('HAPS conversion to VCF complete')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf_phase.py Mon Oct 15 18:15:47 2018 -0400 @@ -0,0 +1,542 @@ +import os +import sys +import copy +import shutil +import argparse +import glob +import logging + +sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared'))) + +from vcf_reader_func import checkFormat +from logging_module import initLogger, logArgs +from model import read_model_file +from beagle import call_beagle +from shapeit import call_shapeit, remove_intermediate_files +from bcftools import pipe_bcftools_to_chr, chr_subset_file, concatenate + +def phase_argument_parser(passed_arguments): + '''Phase Argument Parser - Assigns arguments for vcftools from command line. + Depending on the argument in question, a default value may be specified''' + + def parser_confirm_file (): + '''Custom action to confirm file exists''' + class customAction(argparse.Action): + def __call__(self, parser, args, value, option_string=None): + if not os.path.isfile(value): + raise IOError('%s not found' % value) + setattr(args, self.dest, value) + return customAction + + def metavar_list (var_list): + '''Create a formmated metavar list for the help output''' + return '{' + ', '.join(var_list) + '}' + + phase_parser = argparse.ArgumentParser() + + # Input arguments + phase_parser.add_argument('--vcf', help = "Input VCF filename", type = str, required = True, action = parser_confirm_file()) + + # Model file arguments + phase_parser.add_argument('--model-file', help = 'Defines the model file', type = str, action = parser_confirm_file()) + phase_parser.add_argument('--model', help = 'Defines the model to analyze', type = str) + + # General arguments + phase_parser.add_argument('--overwrite', help = "Overwrite previous output files", action = 'store_true') + phase_parser.add_argument('--beagle-path', help = "Defines path to locate beagle.jar", type = str, default = 'bin/') + + # Phase algorithm argument + phasing_list = ['beagle', 'shapeit'] + phasing_default = 'beagle' + phase_parser.add_argument('--phase-algorithm', metavar = metavar_list(phasing_list), help = 'Specifies the phase algorithm to be used', type = str, choices = phasing_list, default = phasing_default) + + # Common phasing arguments + phase_parser.add_argument('--Ne', help = 'Defines the effective population size', type = int) + phase_parser.add_argument('--random-seed', help="Defines the random seed value for the random number generator", type = int) + phase_parser.add_argument('--genetic-map', help = 'Genetic map filename', type = str, action = parser_confirm_file()) + phase_parser.add_argument('--phase-chr', help = 'Selects a single chromosome to phase', type = str) + phase_parser.add_argument('--phase-from-bp', help = 'Lower bound of sites to include (Only usable with a single chromosome)', type = int) + phase_parser.add_argument('--phase-to-bp', help = 'Upper bound of sites to include (Only usable with a single chromosome)', type = int) + + # Shapeit-specific options + phase_parser.add_argument('--shapeit-burn-iter', help = 'Number of the burn-in iterations (shapeit)', type = int) + phase_parser.add_argument('--shapeit-prune-iter', help = 'Number of pruning iterations (shapeit)', type = int) + phase_parser.add_argument('--shapeit-main-iter', help = 'Number of main iterations (shapeit)', type = int) + phase_parser.add_argument('--shapeit-states', help = 'Number of conditioning states for haplotype estimation (shapeit)', type = int) + phase_parser.add_argument('--shapeit-window', help = 'Model window size in Mb (shapeit)', type = float) + + # Beagle-specific options + phase_parser.add_argument('--beagle-burn-iter', help = 'Number of the burn-in iterations (beagle)', type = int) + phase_parser.add_argument('--beagle-iter', help = 'Number of iterations after burn-in (beagle)', type = int) + phase_parser.add_argument('--beagle-states', help = 'Number of model states for genotype estimation (beagle)', type = int) + phase_parser.add_argument('--beagle-window', help = 'Sliding window size in cM (beagle)', type = float) + phase_parser.add_argument('--beagle-overlap', help = 'Overlap between neighboring sliding windows in cM (beagle)', type = float) + phase_parser.add_argument('--beagle-error', help = 'HMM allele mismatch probability (beagle)', type = float) + phase_parser.add_argument('--beagle-step', help = 'Step length in cM used for identifying short IBS segments (beagle)', type = float) + phase_parser.add_argument('--beagle-nsteps', help = 'Number of consecutive --beagle-steps used for identifying long IBS segments (beagle)', type = int) + + # Output arguments + phase_parser.add_argument('--out', help = 'Defines the output filename') + phase_parser.add_argument('--out-prefix', help = 'Defines the output prefix (used by phasing algorithms)', default = 'out') + + out_format_list = ['vcf', 'vcf.gz', 'bcf'] + out_format_default = 'vcf.gz' + + phase_parser.add_argument('--out-format', metavar = metavar_list(out_format_list), help = 'Specifies the output format.', type = str, choices = out_format_list, default = out_format_default) + + if passed_arguments: + return phase_parser.parse_args(passed_arguments) + else: + return phase_parser.parse_args() + +def concatenate_logs (log_files, new_log_filename): + ''' + Concatenates log files + + This function is used to concatenate log files. If a single chromosome + is being analyzed, the function is used to concatenate the log files + from multiple wrappers. If multiple chromosomes are being analyzed the + function is used to concatenate the logs of each chromosome. + + Raises + ------ + IOError + If a log file does not exist + ''' + + # Open the new log file + with open(new_log_filename, 'wb') as out_file: + # Loop the logs to be combined + for log_file in log_files: + # Confirm the log file exists, raise an error if not + if not os.path.isfile(log_file): + raise IOError('%s not found' % log_file) + # Open the log file + with open(log_file, 'rb') as in_file: + # Copy the contents of the file + shutil.copyfileobj(in_file, out_file) + + # Remove old log files + for log_file in log_files: + os.remove(log_file) + +def assign_vcf_extension (filename): + # Checks if the file is unzipped, bgzipped, or gzipped + vcfname_format = checkFormat(filename) + + if vcfname_format == 'vcf': + return '.vcf' + + elif vcfname_format == 'gzip' or vcfname_format == 'bgzip': + return '.vcf.gz' + + elif vcfname_format == 'bcf': + raise Exception('BCF not supported in current version') + + else: + raise Exception('Unknown file format') + +def run (passed_arguments = []): + ''' + Phaser for VCF files. + + Automates the phasing process for a specified VCF file. The function + allows users to select between multiple phasing algorithms: beagle + (default) and shapit. + + Parameters + ---------- + --vcf : str + Specifies the input VCF filename + --phase-algorithm : str + Specifies the algorithm to be used. Choices: beagle (default) and + shapit + --out : str + Specifies the output filename + + Returns + ------- + output : file + Phased VCF file + + Raises + ------ + IOError + Input VCF file does not exist + IOError + Output file already exists + + ''' + + # Grab VCF arguments from command line + phase_args = phase_argument_parser(passed_arguments) + + # Adds the arguments (i.e. parameters) to the log file + logArgs(phase_args, func_name = 'vcf_phase') + + # Assign file extension for VCF input file + vcfname_ext = assign_vcf_extension(phase_args.vcf) + + logging.info('Input file assigned') + + # Used to confirm if the VCF input was renamed + vcfname_renamed = False + + # Confirm input has correct file extension + if vcfname_ext not in phase_args.vcf: + vcfname_renamed = True + os.rename(phase_args.vcf, phase_args.vcf + vcfname_ext) + phase_args.vcf += vcfname_ext + + # List to hold beagle/shapeit arguments + phase_call_args = [] + + # bool to check if an eclude file was created + exclude_file_created = False + + # bool to check if an include file was created + include_file_created = False + + # Performs actions related to --model-file and argument assignment + if phase_args.model_file: + + # Check if a model has been specified + if not phase_args.model: + raise IOError('No selected model. Please use --model to select a model') + + # Read in the model file + models_file = read_model_file(phase_args.model_file) + + # Check that the selected model was found in the file + if phase_args.model not in models_file: + raise IOError('Selected model "%s" not found in: %s' % (phase_args.model, phase_args.model_file)) + + logging.info('Model assigned') + + if phase_args.phase_algorithm == 'beagle': + + # Get list of individuals to include + inds_in_model = models_file[phase_args.model].inds + + # Create file with individuals to exclude (beagle has no include option) + models_file.create_exclude_ind_file(inds_to_include = inds_in_model, overwrite = phase_args.overwrite) + + # Assign exclude file + phase_call_args.append('excludesamples=' + models_file.exclude_file) + + # Confirm that the exclude file was created + exclude_file_created = True + + elif phase_args.phase_algorithm == 'shapeit': + + # Assign the model + selected_model = models_file[phase_args.model] + + # Create file with individuals to exclude (beagle has no include option) + selected_model.create_ind_file(overwrite = phase_args.overwrite) + + # Assign exclude file + phase_call_args.extend(['--include-ind', selected_model.ind_file]) + + # Confirm that the include file was created + include_file_created = True + + # Get the list of chromosomes within the VCF + chrs_in_vcf = pipe_bcftools_to_chr(phase_args.vcf) + + # Check if the user specified a specific chromosome + if phase_args.phase_chr: + # Check if the chromosome is within the file + if phase_args.phase_chr not in chrs_in_vcf: + raise Exception('--phase-chr %s not found in %s' % (phase_args.phase_chr, phase_args.vcf)) + + # Check that a chr was specified if a bp-based argument was specified + if (phase_args.phase_from_bp or phase_args.phase_to_bp) and not phase_args.phase_chr: + # Check if the file has a single chromosome + if len(chrs_in_vcf) == 1: + # Check if the beagle algorithm is being used + if phase_args.phase_algorithm == 'beagle': + # Assign the chromosome, as beagle requires it to be assigned + phase_args.phase_chr = chrs_in_vcf[0] + + logging.info('Assigned chr %s for beagle' % chrs_in_vcf[0]) + # Report error if multiple chromosomes are within the file + else: + raise Exception('The --phase-from-bp and --phase-to-bp arguments ' + 'require the --phase-chr argument to function if ' + 'multiple chrs are within %s' % phase_args.vcf) + + # Assign general arguments and call beagle + if phase_args.phase_algorithm == 'beagle': + + # Assigns the input and output arguments + phase_call_args.extend(['gt=' + phase_args.vcf, + 'out=' + phase_args.out_prefix]) + + # Assign the burn-in iter, if specified + if phase_args.beagle_burn_iter: + phase_call_args.append('burnin=' + phase_args.beagle_burn_iter) + + # Assign the iter, if specified + if phase_args.beagle_iter: + phase_call_args.append('iterations=' + phase_args.beagle_iter) + + # Assign the state count, if specified + if phase_args.beagle_states: + phase_call_args.append('phase-states=' + phase_args.beagle_states) + + # Assign the window length, if specified + if phase_args.beagle_window: + phase_call_args.append('window=' + phase_args.beagle_window) + + # Assign the window overlap, if specified + if phase_args.beagle_overlap: + phase_call_args.append('overlap=' + phase_args.beagle_overlap) + + # Assign the HMM error, if specified + if phase_args.beagle_error: + phase_call_args.append('err=' + phase_args.beagle_error) + + # Assign the step length, if specified + if phase_args.beagle_step: + phase_call_args.append('step=' + phase_args.beagle_step) + + # Assign the number of steps, if specified + if phase_args.beagle_nsteps: + phase_call_args.append('nsteps=' + phase_args.beagle_nsteps) + + # Assign the genetic map, if specified + if phase_args.genetic_map: + phase_call_args.append('map=' + phase_args.genetic_map) + + # Assign the effective pop size, if specified + if phase_args.Ne: + phase_call_args.append('ne=' + phase_args.Ne) + + # Assign the random seed, if specified + if phase_args.random_seed: + phase_call_args.append('seed=' + phase_args.random_seed) + + # Assign the chromosome to phase, if specified + if phase_args.phase_chr: + + # Store the chromosome argument + chr_arg = 'chrom=%s' % phase_args.phase_chr + + # Check if either bp position arguments were specified + if phase_args.phase_from_bp or phase_args.phase_to_bp: + + # List of the position arguments, in their required order + position_args = [':', phase_args.phase_from_bp, '-', phase_args.phase_to_bp] + + # Filter the position arguments to remove empty values + filttered_position_args = filter(None, position_args) + + # Map the arguments to str and add them to the chromosome argument + chr_arg += ''.join(map(str, filttered_position_args)) + + # Add the chromosome argument the argument list + phase_call_args.append(chr_arg) + + # Call beagle wrapper + call_beagle(phase_args.beagle_path, list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format) + + # Assign expected phased output filename + phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format) + + # Rename output to phase_args.out, if specified + if phase_args.out: + shutil.move(phased_output, phase_args.out) + shutil.move(phase_args.out_prefix + '.log', phase_args.out + '.log') + # Rename log using phased_output + else: + shutil.move(phase_args.out_prefix + '.log', phased_output + '.log') + + logging.info('beagle log file created') + + # Assign general arguments and call shapeit + elif phase_args.phase_algorithm == 'shapeit': + + # Assign the shapeit burn in iter, if specified + if phase_args.shapeit_burn_iter: + phase_call_args.extend(['--burn', phase_args.shapeit_burn_iter]) + + # Assign the shapeit prune iter, if specified + if phase_args.shapeit_prune_iter: + phase_call_args.extend(['--prune', phase_args.shapeit_prune_iter]) + + # Assign the shapeit main iter, if specified + if phase_args.shapeit_main_iter: + phase_call_args.extend(['--main', phase_args.shapeit_main_iter]) + + # Assign the number of shapeit states if specified + if phase_args.shapeit_states: + phase_call_args.extend(['--states', phase_args.shapeit_states]) + + # Assign the shapeit window size, if specified + if phase_args.shapeit_window: + phase_call_args.extend(['--window', phase_args.shapeit_window]) + + # Assign the genetic map, if specified + if phase_args.genetic_map: + phase_call_args.extend(['--input-map', phase_args.genetic_map]) + + # Assign the effective pop size, if specified + if phase_args.Ne: + phase_call_args.extend(['--effective-size', phase_args.Ne]) + + # Assign the random seed, if specified + if phase_args.random_seed: + phase_call_args.extend(['--seed', phase_args.random_seed]) + + # Check if only a single shapeit run is required + if phase_args.phase_chr or len(chrs_in_vcf) == 1: + + # Holds shapeit input filename, as a temp file may be required + shapeit_input_vcf = None + + # Check if a single chromosome is selected + if phase_args.phase_chr and len(chrs_in_vcf) > 1: + # Assign the chromosome-specific input prefix + chr_input_prefix = phase_args.vcf + '.' + phase_args.phase_chr + + # Assign the chr-specific file as the shapeit input filename + shapeit_input_vcf = chr_input_prefix + '.vcf.gz' + + # Create the chromosome-specific input + chr_subset_file(phase_args.vcf, + phase_args.phase_chr, + chr_input_prefix, + 'vcf.gz', + from_bp = phase_args.phase_from_bp, + to_bp = phase_args.phase_to_bp) + + logging.info('Chr %s subset created' % phase_args.phase_chr) + + else: + # Assign the shapeit input filename + shapeit_input_vcf = phase_args.vcf + + # Assigns the input and output arguments for shapeit + phase_call_args.extend(['--input-vcf', shapeit_input_vcf, + '--output-max', phase_args.out_prefix, + '--output-log', phase_args.out_prefix + '.phase.log']) + + # Assign the from bp position argument, if specified + if phase_args.phase_from_bp: + phase_call_args.extend(['--input-from', phase_args.phase_from_bp]) + + # Assign the to bp position argument, if specified + if phase_args.phase_to_bp: + phase_call_args.extend(['--input-to', phase_args.phase_to_bp]) + + # Call shapeit wrapper + call_shapeit(list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format) + + # Assign expected phased output filename + phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format) + + # Combine the log files + concatenate_logs([phase_args.out_prefix + '.phase.log', phase_args.out_prefix + '.log'], phased_output + '.log') + + # Rename output to phase_args.out, if specified + if phase_args.out: + shutil.move(phased_output, phase_args.out) + shutil.move(phased_output + '.log', phase_args.out + '.log') + + logging.info('shapeit log file created') + + # Check if a chr subset file was created + if phase_args.phase_chr and len(chrs_in_vcf) > 1: + # Delete the chromosome-specific input + os.remove(shapeit_input_vcf) + + logging.info('Chr %s subset deleted' % phase_args.phase_chr) + + # Remove intermediate files created by shapeit + remove_intermediate_files(phase_args.out_prefix) + + # Check if multiple shapeit runs are required + else: + + # List to store the phased filenames + phased_filename_list = [] + + # List to store the phased logs + phased_log_list = [] + + logging.info('Multi-chr shapeit phasing assigned') + + for chr_in_vcf in chrs_in_vcf: + + logging.info('Chr %s assigned' % chr_in_vcf) + + # Copy the arguments for this run + chr_call_args = copy.deepcopy(phase_call_args) + + # Assign the chromosome-specific output prefix + chr_out_prefix = phase_args.out_prefix + '.' + chr_in_vcf + + # Assign the expected chromosome-specific output filename + chr_out_filename = '%s.%s' % (chr_out_prefix, phase_args.out_format) + + # Store the output filename + phased_filename_list.append(chr_out_filename) + + # Assign the chromosome-specific input prefix + chr_input_prefix = phase_args.vcf + '.' + chr_in_vcf + + # Assign the expected chromosome-specific input filename + chr_in_filename = chr_input_prefix + '.vcf.gz' + + # Create the chromosome-specific input + chr_subset_file(phase_args.vcf, + chr_in_vcf, + chr_input_prefix, + 'vcf.gz') + + # Assigns the input and output arguments for shapeit + chr_call_args.extend(['--input-vcf', chr_in_filename, + '--output-max', chr_out_prefix, + '--output-log', chr_out_prefix + '.phase.log']) + + # Call shapeit wrapper + call_shapeit(list(map(str, chr_call_args)), chr_out_prefix, phase_args.out_format) + + # Combine log files + concatenate_logs([chr_out_prefix + '.phase.log', chr_out_prefix + '.log'], chr_out_filename + '.log') + + # Store the filename of the combined logs + phased_log_list.append(chr_out_filename + '.log') + + # Delete the chromosome-specific input + os.remove(chr_in_filename) + + # Remove intermediate files created by shapeit + remove_intermediate_files(chr_out_prefix) + + # Concatenate the vcf files + concatenate(phased_filename_list, phase_args.out_prefix, phase_args.out_format) + + logging.info('Concatenated chromosomes') + + # Assign expected concatenated output filename + phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format) + + # Combine the log files + concatenate_logs(phased_log_list, phased_output + '.log') + + # Rename output to phase_args.out, if specified + if phase_args.out: + shutil.move(phased_output, phase_args.out) + shutil.move(phased_output + '.log', phase_args.out + '.log') + + logging.info('Multi-chr shapeit log created') + + # Reverts the VCF input file + if vcfname_renamed: + os.rename(phase_args.vcf, phase_args.vcf[:-len(vcfname_ext)]) + +if __name__ == "__main__": + initLogger() + run()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf_phase.xml Mon Oct 15 18:15:47 2018 -0400 @@ -0,0 +1,166 @@ +<tool id="vcf_phase" name="Phase VCF" version="1.0.0.1"> + + <description>files with BEAGLE or SHAPEIT</description> + + <requirements> + <requirement type="package" >pandas</requirement> + <requirement type="package" >pysam</requirement> + <requirement type="package" >shapeit</requirement> + <requirement type="package" >beagle</requirement> + </requirements> + + <command><![CDATA[ + #if $input.is_of_type('vcf_bgzip') + ln -fs $input input.vcf.gz && + #end if + #if $input.is_of_type('vcf') + ln -fs $input input.vcf && + #end if + python $__tool_directory__/vcf_phase.py + #if $input.is_of_type('vcf_bgzip') + --vcf input.vcf.gz + #end if + #if $input.is_of_type('vcf') + --vcf input.vcf + #end if + #if $model_file + --model-file $model_file + --model $model + #end if + --phase-algorithm $phase.phase_algorithm + #if $phase.beagle_burn_iter + --beagle-burn-iter $common.beagle_burn_iter + #end if + #if $phase.beagle_burn_iter + --beagle-burn-iter $phase.beagle_burn_iter + #end if + #if $phase.phase_algorithm == 'beagle' + --beagle-path $__tool_data_path__/shared/jars/ + #if $phase.beagle_iter + --beagle-iter $phase.beagle_iter + #end if + #if $phase.beagle_states + --beagle-states $phase.beagle_states + #end if + #if $phase.beagle_window + --beagle-window $phase.beagle_window + #end if + #if $phase.beagle_overlap + --beagle-overlap $phase.beagle_overlap + #end if + #if $phase.beagle_error + --beagle-error $phase.beagle_error + #end if + #if $phase.beagle_step + --beagle-step $phase.beagle_step + #end if + #if $phase.beagle_nsteps + --beagle-nsteps $phase.beagle_nsteps + #end if + #end if + #if $phase.phase_algorithm == 'shapeit' + #if $phase.shapeit_burn_iter + --shapeit-burn-iter $phase.shapeit_burn_iter + #end if + #if $phase.shapeit_prune_iter + --shapeit-prune-iter $phase.shapeit_prune_iter + #end if + #if $phase.shapeit_main_iter + --shapeit-main-iter $phase.shapeit_main_iter + #end if + #if $phase.shapeit_states + --shapeit-states $phase.shapeit_states + #end if + #if $phase.shapeit_window + --shapeit-window $phase.shapeit_window + #end if + #end if + #if $common.genetic_map + --genetic-map $common.genetic_map + #end if + #if $common.ne + --Ne $common.ne + #end if + #if $common.random_seed + --random-seed $common.random_seed + #end if + #if $common.phase_chr + --phase-chr $common.phase_chr + #end if + #if $common.phase_from_bp + --phase-from-bp $common.phase_from_bp + #end if + #if $common.phase_to_bp + --phase-to-bp $common.phase_to_bp + #end if + --out $output + --out-format $out_format + ]]></command> + + <inputs> + + <param format="vcf,vcf_bgzip" name="input" type="data" label="VCF Input"/> + + <param format="model" name="model_file" type="data" label="Model Input" optional="True"/> + <param name="model" type="select" label="Select Model" refresh_on_change="True"> + <options> + <filter type="data_meta" ref="model_file" key="models"/> + </options> + </param> + + <conditional name="phase"> + <param name="phase_algorithm" type="select" label="Phase Algorithm" refresh_on_change='True'> + <option value="beagle" selected="True" >Beagle</option> + <option value="shapeit">SHAPEIT</option> + </param> + <when value="beagle"> + <param name="beagle_burn_iter" type="integer" label="Burn-in iterations" optional="True"/> + <param name="beagle_iter" type="integer" label="Post burn-in iterations" optional="True"/> + <param name="beagle_states" type="integer" label="Model states for genotype estimation" optional="True"/> + <param name="beagle_window" type="float" label="Sliding window size (cM)" optional="True"/> + <param name="beagle_overlap" type="float" label="Overlap between neighboring sliding windows (cM)" optional="True"/> + <param name="beagle_error" type="float" label="HMM allele mismatch probability" optional="True"/> + <param name="beagle_step" type="float" label="Step length (cM)" optional="True" help="Used for identifying short IBS segments"/> + <param name="beagle_nsteps" type="integer" label="Number of consecutive steps" optional="True" help="Used for identifying long IBS segments"/> + </when> + <when value="shapeit"> + <param name="shapeit_burn_iter" type="integer" label="Burn-in iterations" optional="True"/> + <param name="shapeit_prune_iter" type="integer" label="Pruning iterations" optional="True"/> + <param name="shapeit_main_iter" type="integer" label="Main iterations" optional="True"/> + <param name="shapeit_states" type="integer" label="Conditioning states for haplotype estimation" optional="True"/> + <param name="shapeit_window" type="float" label="Model window size (Mb)" optional="True"/> + </when> + </conditional> + + <section name="common" title="Common Parameters" expanded="True"> + <param name="random_seed" type="integer" label="Seed value" optional="True"/> + <param format="text" name="genetic_map" type="data" label="Genetic Map" optional="True"/> + <param name="ne" type="integer" label="Effective population size" optional="True"/> + <param name="phase_chr" type="text" label="Chromosome to phase" optional="True"/> + <param name="phase_from_bp" type="integer" label="Lower bound of sites to include" help="Only usable with a single chromosome" optional="True"/> + <param name="phase_to_bp" type="integer" label="Upper bound of sites to include" help="Only usable with a single chromosome" optional="True"/> + </section> + + <param name="out_format" type="select" label="Output Format"> + <option value="vcf">VCF File</option> + <option value="vcf.gz" selected="True">bgzipped-VCF File</option> + <option value="bcf">BCF File</option> + </param> + + </inputs> + <outputs> + + <data name="output" format="vcf_bgzip"> + <change_format> + <when input="out_format" value="vcf" format="vcf"/> + <when input="out_format" value="bcf" format="bcf"/> + </change_format> + </data> + + </outputs> + + <help> + VCF Phaser Help Text + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf_reader_func.py Mon Oct 15 18:15:47 2018 -0400 @@ -0,0 +1,474 @@ +import sys +import pysam +import logging +import struct +from random import sample +from collections import defaultdict +import os +import gzip + +def checkIfGzip(filename): + try: + gf = gzip.open(filename) + gl = gf.readline() + gf.close() + vcf_check = b'##fileformat=VCF' + if gl[0:3] == b'BCF': + return 'bcf' + elif gl[:len(vcf_check)] == vcf_check: + return checkHeader(filename) + else: + return 'other' + except: + return 'nozip' + +def checkHeader(filename): + f = open(filename,'rb') + l = f.readline() + f.close() + BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00' + #BGZF_HEADER=b'\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff' + GZF_HEADER=b'\x1f\x8b' + if l[:len(BGZF_HEADER)] == BGZF_HEADER: + return 'bgzip' + if l[:len(GZF_HEADER)] == GZF_HEADER: + return 'gzip' + return 'nozip' + +def checkFormat(vcfname): + """Checks header of given file for compression type + + + Given a filename, opens file and reads first line to check if + file has BGZF or GZIP header. May be extended to check for BCF format + + Parameters + ---------- + filename : str + Name of file to be checked + + Returns + ------- + extension : str {'bgzip','gzip','vcf','other'} + File extension as indicated by header + + """ + typ = checkIfGzip(vcfname) + if typ != 'nozip': + return typ + f = open(vcfname) + l = f.readline() + f.close() + VCF_TAG='##fileformat=VCF' + if l[:len(VCF_TAG)] == VCF_TAG: + return 'vcf' + return 'other' + +def checkIfCpG(record,fasta_ref,offset=0,add_chr=False): + dr = None + pos = record.pos + c = record.chrom + if record.alts is None: + return False + if add_chr: + c = 'chr'+record.chrom + if record.ref == 'C' and 'T' in record.alts: + seq = fasta_ref.fetch(c,pos-1,pos+1) + if seq[0].upper() != 'C': + logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[0])) + #raise Exception("checkIfCpG function not lining up properly") + if seq[1].upper() == 'G': + return True + return False + elif record.ref == 'G' and 'A' in record.alts: + seq = fasta_ref.fetch(c,pos-2,pos) + if seq[1].upper() != 'G': + logging.warning('%s %d has bad base %s' % (record.chrom,record.pos,seq[1])) + #raise Exception("checkIfCpg function not lining up on negative strand") + if seq[0].upper() == 'C': + return True + return False + return False + +def checkForDuplicates(rec_list,pass_list): + for i in range(len(rec_list)-1): + if rec_list[i].pos == rec_list[i+1].pos: + pass_list[i] = False + pass_list[i+1] = False + +def checkForMultiallele(rec_list,pass_list): + for i in range(len(rec_list)): + if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos: + pass_list[i] = False + pass_list[i+1] = False + if len(rec_list[i].alleles) > 2: + pass_list[i] = False + +def flipChrom(chrom): + if chrom[0:3] == 'chr': + return chrom[0:3] + return 'chr'+chrom + +def getAlleleCountDict(rec): + alleles = defaultdict(int) + total_sites = 0 + missing_inds = 0 + for j in range(len(rec.samples)): + samp = rec.samples[j] + if None in samp.alleles: + missing_inds += 1 + for k in range(len(samp.alleles)): + b = samp.alleles[k] + if b is not None: + alleles[b] += 1 + total_sites+=1 + return alleles, total_sites, missing_inds + +def isInformative(rec, mincount=2, alleles=None): + count = 0 + if alleles is None: + alleles, total_sites, missing_inds = getAlleleCountDict(rec) + if len(alleles) != 2: + return False + i1,i2 = alleles.keys() + return (alleles[i1] >= mincount and alleles[i2] >= mincount) + +def getPassSites(record_list, remove_cpg=False, remove_indels=True, + remove_multiallele=True, remove_missing=0, + inform_level=2, fasta_ref=None): + pass_list = [True for r in record_list] + if remove_cpg == True and fasta_ref is None: + raise Exception("CpG removal requires a reference") + if inform_level > 2 or inform_level < 0: + raise Exception("Inform level %d must be between 0 and 2" % inform_level) + if remove_multiallele: + checkForMultiallele(record_list,pass_list) + for i in range(len(record_list)): + rec = record_list[i] + logging.info(rec.pos) + if remove_indels and not checkRecordIsSnp(rec): + pass_list[i] = False + if remove_cpg and checkIfCpG(rec,fasta_ref): + pass_list[i] = False + alleles,total_sites,missing_inds = getAlleleCountDict(rec) + if remove_missing != -1 and missing_inds > remove_missing: + pass_list[i] = False + if inform_level != 0 and not isInformative(rec,mincount=inform_level,alleles=alleles): + pass_list[i] = False + #pp = zip([r.pos for r in record_list],pass_list) + #for ppp in pp: + # logging.info(ppp) + return pass_list + + +def filterSites(record_list, remove_cpg=False, remove_indels=True, + remove_multiallele=True, remove_missing=0, inform_level=2, + fasta_ref=None): + pass_list = getPassSites(record_list,remove_cpg,remove_indels,remove_multiallele,remove_missing,inform_level,fasta_ref) + out_list = [] + for i in range(len(pass_list)): + if pass_list[i]: + out_list.append(record_list[i]) + return out_list + + + +class VcfReader(): + def __init__(self, vcfname, compress_flag=False, subsamp_num=None, + subsamp_fn=None, subsamp_list=None, index=None, popmodel=None, use_allpop=False): + + ext = checkFormat(vcfname) + if ext in ['gzip','other'] : + raise Exception(('Input file %s is gzip-formatted, must be either ' + 'uncompressed or zipped with bgzip' % vcfname)) + self.file_uncompressed = (ext == 'vcf') + self.reader_uncompressed = (self.file_uncompressed and not compress_flag) + self.popmodel = None + self.popkeys = None + if popmodel is not None and use_allpop: + raise Exception("Popmodel and allpop cannot both be specified") + if compress_flag and file_uncompressed: + vcfname = compressVcf(vcfname) + if subsamp_num is not None: + if subsamp_list is not None: + raise Exception('Multiple subsampling options called in getVcfReader') + subsamp_list = getSubsampleList(vcfname, subsamp_num) + elif subsamp_fn is not None: + if subsamp_list is not None: + raise Exception('Multiple subsampling options called in getVcfReader') + subsamp_file = open(subsamp_fn,'r') + subsamp_list = [l.strip() for l in subsamp_file.readlines()] + subsamp_file.close() + + if index is None: + self.reader = pysam.VariantFile(vcfname) + else: + self.reader = pysam.VariantFile(vcfname, index_filename=index) + if popmodel is not None: + self.popmodel = popmodel + popsamp_list = popmodel.inds + self.reader.subset_samples(popsamp_list) + self.setPopIdx() + if use_allpop: + self.setAllPop() + if subsamp_list is not None: + logging.debug('Subsampling %d individuals from VCF file' % + (len(subsamp_list))) + self.reader.subset_samples(subsamp_list) + self.prev_last_rec = next(self.reader) + self.chr_in_chrom = (self.prev_last_rec.chrom[0:3] == 'chr') + + def fetch(self, chrom=None, start=None, end=None): + return self.reader.fetch(chrom, start, end) + + def getRecordList(self, region=None, chrom=None, start=None, + end=None): + if self.reader_uncompressed: + ret, self.prev_last_rec = getRecordListUnzipped(self.reader, self.prev_last_rec, region, add_chr=self.chr_in_chrom) + return ret + else: + return getRecordList(self.reader, region, chrom, start, end, self.chr_in_chrom) + + def setPopIdx(self): + self.popkeys = {} + sample_names = [l for l in self.reader.header.samples] + for p in self.popmodel.pop_list: + self.popkeys[p] = [] + for ind in self.popmodel.ind_dict[p]: + self.popkeys[p].append(sample_names.index(ind)) + + def close(self): + self.reader.close() + + def setAllPop(self): + self.popkeys = {'ALL':[]} + for i in range(len(self.reader.header.samples)): + self.popkeys['ALL'].append(i) + +def modChrom(c,vcf_chr): + if c is None: + return None + if vcf_chr and c[:3] != 'chr': + return 'chr'+c + if not vcf_chr and c[:3] == 'chr': + return c[3:] + return c + +def getRecordList(vcf_reader, region=None, chrom=None, start=None, + end=None, add_chr=False): + """Returns list for use in subsampling from input file""" + if region is not None: + c = modChrom(region.chrom,add_chr) + var_sites = vcf_reader.fetch(c, region.start, region.end) + else: + c = modChrom(chrom,add_chr) + var_sites = vcf_reader.fetch(c, start, end) + lst = [] + for rec in var_sites: + lst.append(rec) + return lst + + +def getRecordListUnzipped(vcf_reader, prev_last_rec, region=None, chrom=None, + start=None, end=None, add_chr=False): + """Method for getting record list from unzipped VCF file. + + This method will sequentially look through a VCF file until it finds + the given `start` position on `chrom`, then add all records to a list + until the `end` position has been reached. Note that `prev_last_rec` + must be kept track of externally to ensure that if consecutive regions + are called, the record of the first variant outside the first region + is not lost. + + Parameters + ---------- + vcf_reader : pysam VariantFile object + VCF reader initialized from other function + region : region object + Region object with start and end coordinates of region of interest. + prev_last_rec : VariantRecord object + Variable with last record read from VcfReader. Stored here so that + if two adjacent regions are called, the overflow record from the + first region is still included in the next region + + Returns + ------- + lst : list + List of records in given gene region + prev_last_rec : VariantRecord object + First record after target region, for use in next call + + """ + lst = [] + if region is None: + lst.append(prev_last_rec) + for rec in vcf_reader: + lst.append(rec) + return lst, lst[-1] + + if (prev_last_rec is not None and + region.containsRecord(prev_last_rec) == 'in'): + lst.append(prev_last_rec) + elif (prev_last_rec is not None and + region.containsRecord(prev_last_rec) == 'after'): + return [] + rec = next(vcf_reader,None) + if rec is None: + return lst,None + place = region.containsRecord(rec) + while rec is not None and place != 'after': + if place == 'in': + lst.append(rec) + rec = next(vcf_reader,None) + if rec is None: + break + place = region.containsRecord(rec) + prev_last_rec = rec + return lst, prev_last_rec + + +def checkRecordIsSnp(rec): + """Checks if this record is a single nucleotide variant, returns bool.""" + if len(rec.ref) != 1: + return False + if rec.alts is None: + return False + for allele in rec.alts: + if len(allele) != 1: + return False + return True + + +def getSubsampleList(vcfname, ss_count): + """Returns a list of the first `ss_count` individuals in `vcfname` + + """ + + vcf_o = pysam.VariantFile(vcfname) + rec = next(vcf_o) + vcf_o.close() + lst = [] + for samp in rec.samples: + lst.append(samp) + return lst[:int(ss_count)] + + +def compressVcf(vcfname,forceflag=False,remove=False): + """Runs bgzip and tabix on input VCF file. + + Using the pysam library, this function runs the bgzip and tabix utilities + on the given input file. By default, this will not overwrite an existing + zipped file, but will overwrite an existing index. `remove` can be set to + delete the unzipped file. + + Parameters + ---------- + vcfname : str + Name of uncompressed VCF file + forceflag : bool (False) + If true, will overwrite (vcfname).gz if it exists + remove : bool (False) + If true, will delete uncompressed source file + + Returns + ------- + cvcfname : str + Filepath to compressed VCF file + """ + cvcfname = vcfname+".gz" + pysam.tabix_compress(vcfname,cvcfname,force=forceflag) + pysam.tabix_index(cvcfname,preset="vcf",force=True) + if remove: + os.remove(vcfname) + return cvcfname + +def vcfRegionName(prefix, region, ext, oneidx=False, + halfopen=True, sep='-'): + chrom = region.toStr(halfopen, oneidx, sep) + return prefix+'_'+chrom+'.'+ext + +def getRecordsInRegion(region, record_list): + sub_list = [] + for i in range(len(record_list)): + loc = region.containsRecord(record_list[i]) + if loc == "in": + sub_list.append(record_list[i]) + elif loc == "after": + break + return sub_list + + + + + +#def getVcfReader(args): +def getVcfReader(vcfname, compress_flag=False, subsamp_num=None, + subsamp_fn=None, subsamp_list=None, index=None): + """Returns a reader for a given input VCF file. + + Given a filename, filetype, compression option, and optional Subsampling + options, will return a pysam.VariantFile object for iteration and + a flag as to whether this file is compressed or uncompressed. + + Parameters + ---------- + vcfname : str + Filename for VCF file. The extension of this file will be used to + determine whether it is compressed or not unless `var_ext` is set. + var_ext : str (None) + Extension for VCF file if it is not included in the filename. + compress_flag : bool (False) + If filetype is uncompressed and this is set to true, will run + compressVcf function. + subsamp_num : int (None) + If set, will randomly select `subsamp_num` individuals (not + genotypes) from the input VCF file and return a reader with + only those data. + subsamp_fn : str (None) + If set, will return a reader with only data from the samples listed + in the file provided. Cannot be used with other subsampling options. + subsamp_list : list (None) + If set, will return reader with records containing only + individuals named in the list. Cannot be used with other subsampling + options. + + Returns + ------- + vcf_reader : pysam.VariantFile + A reader that can be iterated through for variant records. If + compressed, it will be able to use the pysam fetch method, otherwise + it must be read through sequentially + reader_uncompressed : bool + If True, VCF reader is uncompressed. This means the fetch method + cannot be used and region access must be done using the + "getRecordListUnzipped" method. + + """ + ext = checkFormat(vcfname) + if ext in ['gzip','other'] : + raise Exception(('Input file %s is gzip-formatted, must be either ' + 'uncompressed or zipped with bgzip' % vcfname)) + file_uncompressed = (ext == 'vcf') + reader_uncompressed = (file_uncompressed and not compress_flag) + if compress_flag and file_uncompressed: + vcfname = compressVcf(vcfname) + #subsamp_list = None + if subsamp_num is not None: + if subsamp_list is not None: + raise Exception('Multiple subsampling options called in getVcfReader') + subsamp_list = getSubsampleList(vcfname, subsamp_num) + elif subsamp_fn is not None: + if subsamp_list is not None: + raise Exception('Multiple subsampling options called in getVcfReader') + subsamp_file = open(subsamp_fn,'r') + subsamp_list = [l.strip() for l in subsamp_file.readlines()] + subsamp_file.close() + if index is None: + vcf_reader = pysam.VariantFile(vcfname) + else: + vcf_reader = pysam.VariantFile(vcfname, index_filename=index) + if subsamp_list is not None: + logging.debug('Subsampling %d individuals from VCF file' % + (len(subsamp_list))) + vcf_reader.subset_samples(subsamp_list) + return vcf_reader, reader_uncompressed