Mercurial > repos > jaredgk > ppp_vcfphase
comparison vcf_phase.py @ 0:3830d29fca6a draft
Uploaded
| author | jaredgk | 
|---|---|
| date | Mon, 15 Oct 2018 18:15:47 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:3830d29fca6a | 
|---|---|
| 1 import os | |
| 2 import sys | |
| 3 import copy | |
| 4 import shutil | |
| 5 import argparse | |
| 6 import glob | |
| 7 import logging | |
| 8 | |
| 9 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared'))) | |
| 10 | |
| 11 from vcf_reader_func import checkFormat | |
| 12 from logging_module import initLogger, logArgs | |
| 13 from model import read_model_file | |
| 14 from beagle import call_beagle | |
| 15 from shapeit import call_shapeit, remove_intermediate_files | |
| 16 from bcftools import pipe_bcftools_to_chr, chr_subset_file, concatenate | |
| 17 | |
| 18 def phase_argument_parser(passed_arguments): | |
| 19 '''Phase Argument Parser - Assigns arguments for vcftools from command line. | |
| 20 Depending on the argument in question, a default value may be specified''' | |
| 21 | |
| 22 def parser_confirm_file (): | |
| 23 '''Custom action to confirm file exists''' | |
| 24 class customAction(argparse.Action): | |
| 25 def __call__(self, parser, args, value, option_string=None): | |
| 26 if not os.path.isfile(value): | |
| 27 raise IOError('%s not found' % value) | |
| 28 setattr(args, self.dest, value) | |
| 29 return customAction | |
| 30 | |
| 31 def metavar_list (var_list): | |
| 32 '''Create a formmated metavar list for the help output''' | |
| 33 return '{' + ', '.join(var_list) + '}' | |
| 34 | |
| 35 phase_parser = argparse.ArgumentParser() | |
| 36 | |
| 37 # Input arguments | |
| 38 phase_parser.add_argument('--vcf', help = "Input VCF filename", type = str, required = True, action = parser_confirm_file()) | |
| 39 | |
| 40 # Model file arguments | |
| 41 phase_parser.add_argument('--model-file', help = 'Defines the model file', type = str, action = parser_confirm_file()) | |
| 42 phase_parser.add_argument('--model', help = 'Defines the model to analyze', type = str) | |
| 43 | |
| 44 # General arguments | |
| 45 phase_parser.add_argument('--overwrite', help = "Overwrite previous output files", action = 'store_true') | |
| 46 phase_parser.add_argument('--beagle-path', help = "Defines path to locate beagle.jar", type = str, default = 'bin/') | |
| 47 | |
| 48 # Phase algorithm argument | |
| 49 phasing_list = ['beagle', 'shapeit'] | |
| 50 phasing_default = 'beagle' | |
| 51 phase_parser.add_argument('--phase-algorithm', metavar = metavar_list(phasing_list), help = 'Specifies the phase algorithm to be used', type = str, choices = phasing_list, default = phasing_default) | |
| 52 | |
| 53 # Common phasing arguments | |
| 54 phase_parser.add_argument('--Ne', help = 'Defines the effective population size', type = int) | |
| 55 phase_parser.add_argument('--random-seed', help="Defines the random seed value for the random number generator", type = int) | |
| 56 phase_parser.add_argument('--genetic-map', help = 'Genetic map filename', type = str, action = parser_confirm_file()) | |
| 57 phase_parser.add_argument('--phase-chr', help = 'Selects a single chromosome to phase', type = str) | |
| 58 phase_parser.add_argument('--phase-from-bp', help = 'Lower bound of sites to include (Only usable with a single chromosome)', type = int) | |
| 59 phase_parser.add_argument('--phase-to-bp', help = 'Upper bound of sites to include (Only usable with a single chromosome)', type = int) | |
| 60 | |
| 61 # Shapeit-specific options | |
| 62 phase_parser.add_argument('--shapeit-burn-iter', help = 'Number of the burn-in iterations (shapeit)', type = int) | |
| 63 phase_parser.add_argument('--shapeit-prune-iter', help = 'Number of pruning iterations (shapeit)', type = int) | |
| 64 phase_parser.add_argument('--shapeit-main-iter', help = 'Number of main iterations (shapeit)', type = int) | |
| 65 phase_parser.add_argument('--shapeit-states', help = 'Number of conditioning states for haplotype estimation (shapeit)', type = int) | |
| 66 phase_parser.add_argument('--shapeit-window', help = 'Model window size in Mb (shapeit)', type = float) | |
| 67 | |
| 68 # Beagle-specific options | |
| 69 phase_parser.add_argument('--beagle-burn-iter', help = 'Number of the burn-in iterations (beagle)', type = int) | |
| 70 phase_parser.add_argument('--beagle-iter', help = 'Number of iterations after burn-in (beagle)', type = int) | |
| 71 phase_parser.add_argument('--beagle-states', help = 'Number of model states for genotype estimation (beagle)', type = int) | |
| 72 phase_parser.add_argument('--beagle-window', help = 'Sliding window size in cM (beagle)', type = float) | |
| 73 phase_parser.add_argument('--beagle-overlap', help = 'Overlap between neighboring sliding windows in cM (beagle)', type = float) | |
| 74 phase_parser.add_argument('--beagle-error', help = 'HMM allele mismatch probability (beagle)', type = float) | |
| 75 phase_parser.add_argument('--beagle-step', help = 'Step length in cM used for identifying short IBS segments (beagle)', type = float) | |
| 76 phase_parser.add_argument('--beagle-nsteps', help = 'Number of consecutive --beagle-steps used for identifying long IBS segments (beagle)', type = int) | |
| 77 | |
| 78 # Output arguments | |
| 79 phase_parser.add_argument('--out', help = 'Defines the output filename') | |
| 80 phase_parser.add_argument('--out-prefix', help = 'Defines the output prefix (used by phasing algorithms)', default = 'out') | |
| 81 | |
| 82 out_format_list = ['vcf', 'vcf.gz', 'bcf'] | |
| 83 out_format_default = 'vcf.gz' | |
| 84 | |
| 85 phase_parser.add_argument('--out-format', metavar = metavar_list(out_format_list), help = 'Specifies the output format.', type = str, choices = out_format_list, default = out_format_default) | |
| 86 | |
| 87 if passed_arguments: | |
| 88 return phase_parser.parse_args(passed_arguments) | |
| 89 else: | |
| 90 return phase_parser.parse_args() | |
| 91 | |
| 92 def concatenate_logs (log_files, new_log_filename): | |
| 93 ''' | |
| 94 Concatenates log files | |
| 95 | |
| 96 This function is used to concatenate log files. If a single chromosome | |
| 97 is being analyzed, the function is used to concatenate the log files | |
| 98 from multiple wrappers. If multiple chromosomes are being analyzed the | |
| 99 function is used to concatenate the logs of each chromosome. | |
| 100 | |
| 101 Raises | |
| 102 ------ | |
| 103 IOError | |
| 104 If a log file does not exist | |
| 105 ''' | |
| 106 | |
| 107 # Open the new log file | |
| 108 with open(new_log_filename, 'wb') as out_file: | |
| 109 # Loop the logs to be combined | |
| 110 for log_file in log_files: | |
| 111 # Confirm the log file exists, raise an error if not | |
| 112 if not os.path.isfile(log_file): | |
| 113 raise IOError('%s not found' % log_file) | |
| 114 # Open the log file | |
| 115 with open(log_file, 'rb') as in_file: | |
| 116 # Copy the contents of the file | |
| 117 shutil.copyfileobj(in_file, out_file) | |
| 118 | |
| 119 # Remove old log files | |
| 120 for log_file in log_files: | |
| 121 os.remove(log_file) | |
| 122 | |
| 123 def assign_vcf_extension (filename): | |
| 124 # Checks if the file is unzipped, bgzipped, or gzipped | |
| 125 vcfname_format = checkFormat(filename) | |
| 126 | |
| 127 if vcfname_format == 'vcf': | |
| 128 return '.vcf' | |
| 129 | |
| 130 elif vcfname_format == 'gzip' or vcfname_format == 'bgzip': | |
| 131 return '.vcf.gz' | |
| 132 | |
| 133 elif vcfname_format == 'bcf': | |
| 134 raise Exception('BCF not supported in current version') | |
| 135 | |
| 136 else: | |
| 137 raise Exception('Unknown file format') | |
| 138 | |
| 139 def run (passed_arguments = []): | |
| 140 ''' | |
| 141 Phaser for VCF files. | |
| 142 | |
| 143 Automates the phasing process for a specified VCF file. The function | |
| 144 allows users to select between multiple phasing algorithms: beagle | |
| 145 (default) and shapit. | |
| 146 | |
| 147 Parameters | |
| 148 ---------- | |
| 149 --vcf : str | |
| 150 Specifies the input VCF filename | |
| 151 --phase-algorithm : str | |
| 152 Specifies the algorithm to be used. Choices: beagle (default) and | |
| 153 shapit | |
| 154 --out : str | |
| 155 Specifies the output filename | |
| 156 | |
| 157 Returns | |
| 158 ------- | |
| 159 output : file | |
| 160 Phased VCF file | |
| 161 | |
| 162 Raises | |
| 163 ------ | |
| 164 IOError | |
| 165 Input VCF file does not exist | |
| 166 IOError | |
| 167 Output file already exists | |
| 168 | |
| 169 ''' | |
| 170 | |
| 171 # Grab VCF arguments from command line | |
| 172 phase_args = phase_argument_parser(passed_arguments) | |
| 173 | |
| 174 # Adds the arguments (i.e. parameters) to the log file | |
| 175 logArgs(phase_args, func_name = 'vcf_phase') | |
| 176 | |
| 177 # Assign file extension for VCF input file | |
| 178 vcfname_ext = assign_vcf_extension(phase_args.vcf) | |
| 179 | |
| 180 logging.info('Input file assigned') | |
| 181 | |
| 182 # Used to confirm if the VCF input was renamed | |
| 183 vcfname_renamed = False | |
| 184 | |
| 185 # Confirm input has correct file extension | |
| 186 if vcfname_ext not in phase_args.vcf: | |
| 187 vcfname_renamed = True | |
| 188 os.rename(phase_args.vcf, phase_args.vcf + vcfname_ext) | |
| 189 phase_args.vcf += vcfname_ext | |
| 190 | |
| 191 # List to hold beagle/shapeit arguments | |
| 192 phase_call_args = [] | |
| 193 | |
| 194 # bool to check if an eclude file was created | |
| 195 exclude_file_created = False | |
| 196 | |
| 197 # bool to check if an include file was created | |
| 198 include_file_created = False | |
| 199 | |
| 200 # Performs actions related to --model-file and argument assignment | |
| 201 if phase_args.model_file: | |
| 202 | |
| 203 # Check if a model has been specified | |
| 204 if not phase_args.model: | |
| 205 raise IOError('No selected model. Please use --model to select a model') | |
| 206 | |
| 207 # Read in the model file | |
| 208 models_file = read_model_file(phase_args.model_file) | |
| 209 | |
| 210 # Check that the selected model was found in the file | |
| 211 if phase_args.model not in models_file: | |
| 212 raise IOError('Selected model "%s" not found in: %s' % (phase_args.model, phase_args.model_file)) | |
| 213 | |
| 214 logging.info('Model assigned') | |
| 215 | |
| 216 if phase_args.phase_algorithm == 'beagle': | |
| 217 | |
| 218 # Get list of individuals to include | |
| 219 inds_in_model = models_file[phase_args.model].inds | |
| 220 | |
| 221 # Create file with individuals to exclude (beagle has no include option) | |
| 222 models_file.create_exclude_ind_file(inds_to_include = inds_in_model, overwrite = phase_args.overwrite) | |
| 223 | |
| 224 # Assign exclude file | |
| 225 phase_call_args.append('excludesamples=' + models_file.exclude_file) | |
| 226 | |
| 227 # Confirm that the exclude file was created | |
| 228 exclude_file_created = True | |
| 229 | |
| 230 elif phase_args.phase_algorithm == 'shapeit': | |
| 231 | |
| 232 # Assign the model | |
| 233 selected_model = models_file[phase_args.model] | |
| 234 | |
| 235 # Create file with individuals to exclude (beagle has no include option) | |
| 236 selected_model.create_ind_file(overwrite = phase_args.overwrite) | |
| 237 | |
| 238 # Assign exclude file | |
| 239 phase_call_args.extend(['--include-ind', selected_model.ind_file]) | |
| 240 | |
| 241 # Confirm that the include file was created | |
| 242 include_file_created = True | |
| 243 | |
| 244 # Get the list of chromosomes within the VCF | |
| 245 chrs_in_vcf = pipe_bcftools_to_chr(phase_args.vcf) | |
| 246 | |
| 247 # Check if the user specified a specific chromosome | |
| 248 if phase_args.phase_chr: | |
| 249 # Check if the chromosome is within the file | |
| 250 if phase_args.phase_chr not in chrs_in_vcf: | |
| 251 raise Exception('--phase-chr %s not found in %s' % (phase_args.phase_chr, phase_args.vcf)) | |
| 252 | |
| 253 # Check that a chr was specified if a bp-based argument was specified | |
| 254 if (phase_args.phase_from_bp or phase_args.phase_to_bp) and not phase_args.phase_chr: | |
| 255 # Check if the file has a single chromosome | |
| 256 if len(chrs_in_vcf) == 1: | |
| 257 # Check if the beagle algorithm is being used | |
| 258 if phase_args.phase_algorithm == 'beagle': | |
| 259 # Assign the chromosome, as beagle requires it to be assigned | |
| 260 phase_args.phase_chr = chrs_in_vcf[0] | |
| 261 | |
| 262 logging.info('Assigned chr %s for beagle' % chrs_in_vcf[0]) | |
| 263 # Report error if multiple chromosomes are within the file | |
| 264 else: | |
| 265 raise Exception('The --phase-from-bp and --phase-to-bp arguments ' | |
| 266 'require the --phase-chr argument to function if ' | |
| 267 'multiple chrs are within %s' % phase_args.vcf) | |
| 268 | |
| 269 # Assign general arguments and call beagle | |
| 270 if phase_args.phase_algorithm == 'beagle': | |
| 271 | |
| 272 # Assigns the input and output arguments | |
| 273 phase_call_args.extend(['gt=' + phase_args.vcf, | |
| 274 'out=' + phase_args.out_prefix]) | |
| 275 | |
| 276 # Assign the burn-in iter, if specified | |
| 277 if phase_args.beagle_burn_iter: | |
| 278 phase_call_args.append('burnin=' + phase_args.beagle_burn_iter) | |
| 279 | |
| 280 # Assign the iter, if specified | |
| 281 if phase_args.beagle_iter: | |
| 282 phase_call_args.append('iterations=' + phase_args.beagle_iter) | |
| 283 | |
| 284 # Assign the state count, if specified | |
| 285 if phase_args.beagle_states: | |
| 286 phase_call_args.append('phase-states=' + phase_args.beagle_states) | |
| 287 | |
| 288 # Assign the window length, if specified | |
| 289 if phase_args.beagle_window: | |
| 290 phase_call_args.append('window=' + phase_args.beagle_window) | |
| 291 | |
| 292 # Assign the window overlap, if specified | |
| 293 if phase_args.beagle_overlap: | |
| 294 phase_call_args.append('overlap=' + phase_args.beagle_overlap) | |
| 295 | |
| 296 # Assign the HMM error, if specified | |
| 297 if phase_args.beagle_error: | |
| 298 phase_call_args.append('err=' + phase_args.beagle_error) | |
| 299 | |
| 300 # Assign the step length, if specified | |
| 301 if phase_args.beagle_step: | |
| 302 phase_call_args.append('step=' + phase_args.beagle_step) | |
| 303 | |
| 304 # Assign the number of steps, if specified | |
| 305 if phase_args.beagle_nsteps: | |
| 306 phase_call_args.append('nsteps=' + phase_args.beagle_nsteps) | |
| 307 | |
| 308 # Assign the genetic map, if specified | |
| 309 if phase_args.genetic_map: | |
| 310 phase_call_args.append('map=' + phase_args.genetic_map) | |
| 311 | |
| 312 # Assign the effective pop size, if specified | |
| 313 if phase_args.Ne: | |
| 314 phase_call_args.append('ne=' + phase_args.Ne) | |
| 315 | |
| 316 # Assign the random seed, if specified | |
| 317 if phase_args.random_seed: | |
| 318 phase_call_args.append('seed=' + phase_args.random_seed) | |
| 319 | |
| 320 # Assign the chromosome to phase, if specified | |
| 321 if phase_args.phase_chr: | |
| 322 | |
| 323 # Store the chromosome argument | |
| 324 chr_arg = 'chrom=%s' % phase_args.phase_chr | |
| 325 | |
| 326 # Check if either bp position arguments were specified | |
| 327 if phase_args.phase_from_bp or phase_args.phase_to_bp: | |
| 328 | |
| 329 # List of the position arguments, in their required order | |
| 330 position_args = [':', phase_args.phase_from_bp, '-', phase_args.phase_to_bp] | |
| 331 | |
| 332 # Filter the position arguments to remove empty values | |
| 333 filttered_position_args = filter(None, position_args) | |
| 334 | |
| 335 # Map the arguments to str and add them to the chromosome argument | |
| 336 chr_arg += ''.join(map(str, filttered_position_args)) | |
| 337 | |
| 338 # Add the chromosome argument the argument list | |
| 339 phase_call_args.append(chr_arg) | |
| 340 | |
| 341 # Call beagle wrapper | |
| 342 call_beagle(phase_args.beagle_path, list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format) | |
| 343 | |
| 344 # Assign expected phased output filename | |
| 345 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format) | |
| 346 | |
| 347 # Rename output to phase_args.out, if specified | |
| 348 if phase_args.out: | |
| 349 shutil.move(phased_output, phase_args.out) | |
| 350 shutil.move(phase_args.out_prefix + '.log', phase_args.out + '.log') | |
| 351 # Rename log using phased_output | |
| 352 else: | |
| 353 shutil.move(phase_args.out_prefix + '.log', phased_output + '.log') | |
| 354 | |
| 355 logging.info('beagle log file created') | |
| 356 | |
| 357 # Assign general arguments and call shapeit | |
| 358 elif phase_args.phase_algorithm == 'shapeit': | |
| 359 | |
| 360 # Assign the shapeit burn in iter, if specified | |
| 361 if phase_args.shapeit_burn_iter: | |
| 362 phase_call_args.extend(['--burn', phase_args.shapeit_burn_iter]) | |
| 363 | |
| 364 # Assign the shapeit prune iter, if specified | |
| 365 if phase_args.shapeit_prune_iter: | |
| 366 phase_call_args.extend(['--prune', phase_args.shapeit_prune_iter]) | |
| 367 | |
| 368 # Assign the shapeit main iter, if specified | |
| 369 if phase_args.shapeit_main_iter: | |
| 370 phase_call_args.extend(['--main', phase_args.shapeit_main_iter]) | |
| 371 | |
| 372 # Assign the number of shapeit states if specified | |
| 373 if phase_args.shapeit_states: | |
| 374 phase_call_args.extend(['--states', phase_args.shapeit_states]) | |
| 375 | |
| 376 # Assign the shapeit window size, if specified | |
| 377 if phase_args.shapeit_window: | |
| 378 phase_call_args.extend(['--window', phase_args.shapeit_window]) | |
| 379 | |
| 380 # Assign the genetic map, if specified | |
| 381 if phase_args.genetic_map: | |
| 382 phase_call_args.extend(['--input-map', phase_args.genetic_map]) | |
| 383 | |
| 384 # Assign the effective pop size, if specified | |
| 385 if phase_args.Ne: | |
| 386 phase_call_args.extend(['--effective-size', phase_args.Ne]) | |
| 387 | |
| 388 # Assign the random seed, if specified | |
| 389 if phase_args.random_seed: | |
| 390 phase_call_args.extend(['--seed', phase_args.random_seed]) | |
| 391 | |
| 392 # Check if only a single shapeit run is required | |
| 393 if phase_args.phase_chr or len(chrs_in_vcf) == 1: | |
| 394 | |
| 395 # Holds shapeit input filename, as a temp file may be required | |
| 396 shapeit_input_vcf = None | |
| 397 | |
| 398 # Check if a single chromosome is selected | |
| 399 if phase_args.phase_chr and len(chrs_in_vcf) > 1: | |
| 400 # Assign the chromosome-specific input prefix | |
| 401 chr_input_prefix = phase_args.vcf + '.' + phase_args.phase_chr | |
| 402 | |
| 403 # Assign the chr-specific file as the shapeit input filename | |
| 404 shapeit_input_vcf = chr_input_prefix + '.vcf.gz' | |
| 405 | |
| 406 # Create the chromosome-specific input | |
| 407 chr_subset_file(phase_args.vcf, | |
| 408 phase_args.phase_chr, | |
| 409 chr_input_prefix, | |
| 410 'vcf.gz', | |
| 411 from_bp = phase_args.phase_from_bp, | |
| 412 to_bp = phase_args.phase_to_bp) | |
| 413 | |
| 414 logging.info('Chr %s subset created' % phase_args.phase_chr) | |
| 415 | |
| 416 else: | |
| 417 # Assign the shapeit input filename | |
| 418 shapeit_input_vcf = phase_args.vcf | |
| 419 | |
| 420 # Assigns the input and output arguments for shapeit | |
| 421 phase_call_args.extend(['--input-vcf', shapeit_input_vcf, | |
| 422 '--output-max', phase_args.out_prefix, | |
| 423 '--output-log', phase_args.out_prefix + '.phase.log']) | |
| 424 | |
| 425 # Assign the from bp position argument, if specified | |
| 426 if phase_args.phase_from_bp: | |
| 427 phase_call_args.extend(['--input-from', phase_args.phase_from_bp]) | |
| 428 | |
| 429 # Assign the to bp position argument, if specified | |
| 430 if phase_args.phase_to_bp: | |
| 431 phase_call_args.extend(['--input-to', phase_args.phase_to_bp]) | |
| 432 | |
| 433 # Call shapeit wrapper | |
| 434 call_shapeit(list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format) | |
| 435 | |
| 436 # Assign expected phased output filename | |
| 437 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format) | |
| 438 | |
| 439 # Combine the log files | |
| 440 concatenate_logs([phase_args.out_prefix + '.phase.log', phase_args.out_prefix + '.log'], phased_output + '.log') | |
| 441 | |
| 442 # Rename output to phase_args.out, if specified | |
| 443 if phase_args.out: | |
| 444 shutil.move(phased_output, phase_args.out) | |
| 445 shutil.move(phased_output + '.log', phase_args.out + '.log') | |
| 446 | |
| 447 logging.info('shapeit log file created') | |
| 448 | |
| 449 # Check if a chr subset file was created | |
| 450 if phase_args.phase_chr and len(chrs_in_vcf) > 1: | |
| 451 # Delete the chromosome-specific input | |
| 452 os.remove(shapeit_input_vcf) | |
| 453 | |
| 454 logging.info('Chr %s subset deleted' % phase_args.phase_chr) | |
| 455 | |
| 456 # Remove intermediate files created by shapeit | |
| 457 remove_intermediate_files(phase_args.out_prefix) | |
| 458 | |
| 459 # Check if multiple shapeit runs are required | |
| 460 else: | |
| 461 | |
| 462 # List to store the phased filenames | |
| 463 phased_filename_list = [] | |
| 464 | |
| 465 # List to store the phased logs | |
| 466 phased_log_list = [] | |
| 467 | |
| 468 logging.info('Multi-chr shapeit phasing assigned') | |
| 469 | |
| 470 for chr_in_vcf in chrs_in_vcf: | |
| 471 | |
| 472 logging.info('Chr %s assigned' % chr_in_vcf) | |
| 473 | |
| 474 # Copy the arguments for this run | |
| 475 chr_call_args = copy.deepcopy(phase_call_args) | |
| 476 | |
| 477 # Assign the chromosome-specific output prefix | |
| 478 chr_out_prefix = phase_args.out_prefix + '.' + chr_in_vcf | |
| 479 | |
| 480 # Assign the expected chromosome-specific output filename | |
| 481 chr_out_filename = '%s.%s' % (chr_out_prefix, phase_args.out_format) | |
| 482 | |
| 483 # Store the output filename | |
| 484 phased_filename_list.append(chr_out_filename) | |
| 485 | |
| 486 # Assign the chromosome-specific input prefix | |
| 487 chr_input_prefix = phase_args.vcf + '.' + chr_in_vcf | |
| 488 | |
| 489 # Assign the expected chromosome-specific input filename | |
| 490 chr_in_filename = chr_input_prefix + '.vcf.gz' | |
| 491 | |
| 492 # Create the chromosome-specific input | |
| 493 chr_subset_file(phase_args.vcf, | |
| 494 chr_in_vcf, | |
| 495 chr_input_prefix, | |
| 496 'vcf.gz') | |
| 497 | |
| 498 # Assigns the input and output arguments for shapeit | |
| 499 chr_call_args.extend(['--input-vcf', chr_in_filename, | |
| 500 '--output-max', chr_out_prefix, | |
| 501 '--output-log', chr_out_prefix + '.phase.log']) | |
| 502 | |
| 503 # Call shapeit wrapper | |
| 504 call_shapeit(list(map(str, chr_call_args)), chr_out_prefix, phase_args.out_format) | |
| 505 | |
| 506 # Combine log files | |
| 507 concatenate_logs([chr_out_prefix + '.phase.log', chr_out_prefix + '.log'], chr_out_filename + '.log') | |
| 508 | |
| 509 # Store the filename of the combined logs | |
| 510 phased_log_list.append(chr_out_filename + '.log') | |
| 511 | |
| 512 # Delete the chromosome-specific input | |
| 513 os.remove(chr_in_filename) | |
| 514 | |
| 515 # Remove intermediate files created by shapeit | |
| 516 remove_intermediate_files(chr_out_prefix) | |
| 517 | |
| 518 # Concatenate the vcf files | |
| 519 concatenate(phased_filename_list, phase_args.out_prefix, phase_args.out_format) | |
| 520 | |
| 521 logging.info('Concatenated chromosomes') | |
| 522 | |
| 523 # Assign expected concatenated output filename | |
| 524 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format) | |
| 525 | |
| 526 # Combine the log files | |
| 527 concatenate_logs(phased_log_list, phased_output + '.log') | |
| 528 | |
| 529 # Rename output to phase_args.out, if specified | |
| 530 if phase_args.out: | |
| 531 shutil.move(phased_output, phase_args.out) | |
| 532 shutil.move(phased_output + '.log', phase_args.out + '.log') | |
| 533 | |
| 534 logging.info('Multi-chr shapeit log created') | |
| 535 | |
| 536 # Reverts the VCF input file | |
| 537 if vcfname_renamed: | |
| 538 os.rename(phase_args.vcf, phase_args.vcf[:-len(vcfname_ext)]) | |
| 539 | |
| 540 if __name__ == "__main__": | |
| 541 initLogger() | |
| 542 run() | 
