# HG changeset patch # User jaredgk # Date 1539812074 14400 # Node ID 86a9d8d5b29110461f060305b6da0abf7117774d # Parent 901857c9b24f0c58ac703dddf9fb0d02c13066aa Uploaded diff -r 901857c9b24f -r 86a9d8d5b291 bcftools.py --- a/bcftools.py Wed Oct 17 17:30:37 2018 -0400 +++ b/bcftools.py Wed Oct 17 17:34:34 2018 -0400 @@ -7,6 +7,31 @@ from vcf_reader_func import checkFormat +def return_output_format_args (output_format): + ''' + Return bcftools arguments for output format + + Parameters + ---------- + output_format : str + The specified output format + + Raises + ------ + Exception + If output format is unsupported by bcftools + ''' + + # Return the output format arguments + if output_format == 'vcf': + return ['-O', 'v'] + elif output_format == 'bcf': + return ['-O', 'b'] + elif output_format == 'vcf.gz': + return ['-O', 'z'] + else: + raise Exception('Unsupported file format') + def check_bcftools_for_errors (bcftools_stderr): ''' Checks the bgzip stderr for errors @@ -18,28 +43,176 @@ Raises ------ - IOError + Exception If bcftools stderr returns an error ''' # Expand as errors are discovered - if bcftools_stderr: - logging.error(vcftools_stderr) - raise Exception(vcftools_stderr) + + # Log warning messages + if 'W::' in bcftools_stderr: + logging.warning(bcftools_stderr.strip()) + + # Report errors that are not warnings + elif bcftools_stderr: + raise Exception(bcftools_stderr) + +def pipe_bcftools (bcftools_call_args): + ''' + Calls bcftools with pipe output + + The output of this function is the stdout and stderr of bcftools. This + function should only be used if bcftools is being used as the stdin of + another function. Please note that this function does not check the for + errors in the bcftools call. Please check for errors after the call is + closed using check_bcftools_for_errors. + + Parameters + ---------- + bcftools_stderr : str + bcftools stderr + + Returns + ------- + bcftools_call : PIPE + Pipe of subprocess call, including both stdout and stderr + + ''' + + # bcftools subprocess call + bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + return bcftools_call + +def pipe_bcftools_to_chr (vcf_filename): + ''' + Pipes chromosome and/or contig output of bcftools to a list of unique + entries + + The purpose of this function is to return a list of the unique + chromosomes and/or contigs for use in other functions. + + Parameters + ---------- + vcf_filename : str + VCF input + + Returns + ------- + chromosomes_to_return : list + Unique chromosomes and/or contigs within VCF input + ''' + + # Open bcftools pipe + bcftools_call = pipe_bcftools(['query', '-f', '%CHROM\n', vcf_filename]) + + # Create a set to hold unique chromosome + chromosomes_to_return = set() + + try: + + # Current chromosomes/contigs, reduces duplicates if VCF is sorted + previous_chr = None + + # Iterate the bcftools stdout unless error occurs + for bcftools_stdout_line in iter(bcftools_call.stdout.readline, b''): + # Remove the newline character + bcftools_line_chr = bcftools_stdout_line.strip() + # Check if the bcftools bcftools chr is different from stored chr + if bcftools_line_chr != previous_chr: + # Store the new chr for comparisons to reduce duplicates + previous_chr = bcftools_line_chr + # Save the chr + chromosomes_to_return.add(bcftools_line_chr) + + except: + raise Exception('bcftools call error') + + # Close the bcftools stdout + bcftools_call.stdout.close() + + # Wait for bctools to finish + bcftools_call.wait() + + # Read the bcftools stderr + bcftools_stderr = bcftools_call.stderr.read() + + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + bcftools_stderr = bcftools_stderr.decode() + + # Check that the log file was created correctly + check_bcftools_for_errors(bcftools_stderr) + + logging.info('bcftools call complete') + + return list(chromosomes_to_return) def call_bcftools (bcftools_call_args): + ''' + Calls bcftools + + The function calls bcftools. + + Parameters + ---------- + bcftools_call_args : list + bcftools arguments + + Returns + ------- + vcftools_err : str + vcftools log output + + Raises + ------ + Exception + If bcftools stderr returns an error + ''' # bcftools subprocess call bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) # Wait for bcftools to finish - bcftools_out, bcftools_err = bcftools_call.communicate() + bcftools_stdout, bcftools_stderr = bcftools_call.communicate() - check_bcftools_for_errors(bcftools_err) + # Check if code is running in python 3 + if sys.version_info[0] == 3: + # Convert bytes to string + bcftools_stderr = bcftools_stderr.decode() + + check_bcftools_for_errors(bcftools_stderr) logging.info('bcftools call complete') + return bcftools_stderr + def check_for_index (filename): + ''' + Checks for index file + + If the file is capable of having an index (i.e. bgzipped-VCF or BCF) the + function will return either True (i.e. index found) or False. However, + if the file is a VCF the function will return None (as VCF files cannot + have an index). An error is returned if the file is either a + gzipped-VCF file or not a VCF-based format. + + Parameters + ---------- + filename : str + Filename of VCF-formatted file + + Returns + ------- + bool, None + Returns bool for VCF.GZ and BCF files. Returns None for VCF files + + Raises + ------ + Exception + If the file is a gzipped-VCF or of an unknown format + ''' # Assign the file format file_format = checkFormat(filename) @@ -56,10 +229,92 @@ if os.path.isfile(filename + '.csi'): return True + # Check if the file is vcf (does not need an index) + elif file_format == 'vcf': + return None + + # Check if the file is gzip-compressed vcf (cannot have an index) + elif file_format == 'gzip': + raise Exception('GZIP-compressed VCF files do not support index files.') + + # Check if the file is an unknown format + else: + raise Exception('Unknown file format') + # Return false if no index is found return False +def delete_index (filename): + ''' + Deletes an index file + + If the file is capable of having an index (i.e. bgzipped-VCF or BCF) + this function will delete the index. However, if the file is either a + VCF or a gzip-compressed VCF the function will return an error. The + function also results in an error if the index cannot be found. This + function should be used following check_for_index. + + Parameters + ---------- + filename : str + Filename of VCF-formatted file + + Raises + ------ + Exception + No index file could be found + Exception + If the file is a gzipped-VCF or a VCF + ''' + + # Assign the file format + file_format = checkFormat(filename) + + # Check if the file to be indexed is a vcf.gz + if file_format == 'bgzip': + # Check if the index (.tbi) exists + if os.path.isfile(filename + '.tbi'): + # Delete the index + os.remove(filename + '.tbi') + return + + # Check if the file to be indexed is a bcf + elif file_format == 'bcf': + # Check if the index (.csi) exists + if os.path.isfile(filename + '.csi'): + # Delete the index + os.remove(filename + '.csi') + return + + # Check if the file is vcf (cannot have an index) + elif file_format == 'vcf': + raise Exception('VCF format does not support index files.') + + # Check if the file is gzip-compressed vcf (cannot have an index) + elif file_format == 'gzip': + raise Exception('GZIP-compressed VCF files do not support index files.') + + # Return error if no index is found + raise Exception('No index file found.') + def create_index (filename): + ''' + Creates an index file + + If the file is capable of having an index (i.e. bgzipped-VCF or BCF) + this function will create an index file. However, if the file is a + different format the function will return an error. + + Parameters + ---------- + filename : str + Filename of VCF-formatted file + + Raises + ------ + Exception + If the file is not a bgzipped-VCF or BCF + ''' # Assign the file format file_format = checkFormat(filename) @@ -78,7 +333,135 @@ else: raise Exception('Error creating index for: %s. Only .bcf and .vcf.gz (bgzip) files are supported.' % filename) -def convert_to_bcf (filename, output_prefix): +def chr_subset_file (filename, chromosome, output_prefix, output_format, from_bp = None, to_bp = None): + ''' + Creates chromosome subset + + This function is used to create a VCF-formatted subset with only + the data from a single chromosome. + + Parameters + ---------- + filename : str + Filename of VCF-formatted input + chromosome : str + Chromosome to subset + output_prefix : str + Prefix of the VCF-formatted output (i.e. without file extension) + output_format : str + The format of the output (e.g. vcf, bcf, vcf.gz) + from_bp : int, optional + Lower bound of sites to include + to_bp : int, optional + Upper bound of sites to include + ''' + + # Creates a list to the arguments and store the bcftools call + subset_args = ['view'] + + # Assign the output format arguments + output_format_args = return_output_format_args(output_format) + + # Store the output format arguments + subset_args.extend(output_format_args) + + # Stores the specified output filename + vcf_output = '%s.%s' % (output_prefix, output_format) + + # Assigns the output file to the arguments + subset_args.extend(['-o', vcf_output]) + + # Holds the subset argument + chr_subet_arg = chromosome + + # Check if either bp position arguments were specified + if from_bp or to_bp: + + # List of the position arguments, in their required order + position_args = [':', from_bp, '-', to_bp] + + # Filter the position arguments to remove empty values + filttered_position_args = filter(None, position_args) + + # Map the arguments to str and add them to the chromosome argument + chr_subet_arg += ''.join(map(str, filttered_position_args)) + + # Checks if the input file has an index, then subset to the arguments + if check_for_index(filename): + # Subsets using the index + subset_args.extend(['-r', chr_subet_arg]) + else: + # Subsets using the stdout + subset_args.extend(['-t', chr_subet_arg]) + + # Assigns the input file to the arguments + subset_args.append(filename) + + # Call bcftools + call_bcftools(subset_args) + +def concatenate (filenames, output_prefix, output_format, keep_original = False): + ''' + Concatenate multiple VCF-formatted files + + This function will concatenate multiple VCF-formatted files into a + single VCF-formatted file of the specifed format. + + Parameters + ---------- + filenames : list + List of VCF-formatted input filenames + output_prefix : str + Prefix of the VCF-formatted output (i.e. without file extension) + output_format : str + The format of the output (e.g. vcf, bcf, vcf.gz) + ''' + + # Holds the arguments to convert to VCF format + concat_args = ['concat'] + + # Assign the output format arguments + output_format_args = return_output_format_args(output_format) + + # Store the output format arguments + concat_args.extend(output_format_args) + + # Stores the specified output filename + vcf_output = '%s.%s' % (output_prefix, output_format) + + # Assigns the output file to the arguments + concat_args.extend(['-o', vcf_output]) + + # Assigns the input files to merge + concat_args.extend(filenames) + + # Call bcftools + call_bcftools(concat_args) + + # Delete the original files once the merged file is created + if not keep_original: + for filename in filenames: + if check_for_index(filename) == True: + delete_index(filename) + os.remove(filename) + +def convert_to_bcf (filename, output_prefix, keep_original = False): + ''' + Converts a VCF-formatted file to BCF + + This function will convert a VCF-formatted file to BCF with the + specified filename prefix. The function also has the option to keep or + delete the input file once the BCF file has been created. + + Parameters + ---------- + filename : str + Filename of VCF-formatted input + output_prefix : str + Prefix of the BCF output (i.e. without file extension) + keep_original : bool, optional + If the input file should be kept once converted + ''' # Holds the arguments to convert to BCF format convert_args = ['convert', '-O', 'b'] @@ -95,8 +478,29 @@ # Call bcftools call_bcftools(convert_args) + # Delete the original file once the bcf file is created + if not keep_original: + if check_for_index(filename) == True: + delete_index(filename) + os.remove(filename) -def convert_to_vcf (filename, output_prefix): +def convert_to_vcf (filename, output_prefix, keep_original = False): + ''' + Converts a VCF-formatted file to VCF + + This function will convert a VCF-formatted file to VCF with the + specified filename prefix. The function also has the option to keep or + delete the input file once the VCF file has been created. + + Parameters + ---------- + filename : str + Filename of VCF-formatted input + output_prefix : str + Prefix of the VCF output (i.e. without file extension) + keep_original : bool, optional + If the input file should be kept once converted + ''' # Holds the arguments to convert to VCF format convert_args = ['view', '-O', 'v'] @@ -113,7 +517,29 @@ # Call bcftools call_bcftools(convert_args) -def convert_to_vcfgz (filename, output_prefix): + # Delete the original file once the vcf file is created + if not keep_original: + if check_for_index(filename) == True: + delete_index(filename) + os.remove(filename) + +def convert_to_vcfgz (filename, output_prefix, keep_original = False): + ''' + Converts a VCF-formatted file to bgzipped-VCF + + This function will convert a VCF-formatted file to bgzipped-VCF with the + specified filename prefix. The function also has the option to keep or + delete the input file once the bgzipped-VCF file has been created. + + Parameters + ---------- + filename : str + Filename of VCF-formatted input + output_prefix : str + Prefix of the bgzipped-VCF output (i.e. without file extension) + keep_original : bool, optional + If the input file should be kept once converted + ''' # Holds the arguments to convert to VCFGZ format convert_args = ['view', '-O', 'z'] @@ -129,3 +555,9 @@ # Call bcftools call_bcftools(convert_args) + + # Delete the original file once the vcfgz file is created + if not keep_original: + if check_for_index(filename) == True: + delete_index(filename) + os.remove(filename)