comparison vcf_phase.py @ 0:3830d29fca6a draft

Uploaded
author jaredgk
date Mon, 15 Oct 2018 18:15:47 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3830d29fca6a
1 import os
2 import sys
3 import copy
4 import shutil
5 import argparse
6 import glob
7 import logging
8
9 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))
10
11 from vcf_reader_func import checkFormat
12 from logging_module import initLogger, logArgs
13 from model import read_model_file
14 from beagle import call_beagle
15 from shapeit import call_shapeit, remove_intermediate_files
16 from bcftools import pipe_bcftools_to_chr, chr_subset_file, concatenate
17
18 def phase_argument_parser(passed_arguments):
19 '''Phase Argument Parser - Assigns arguments for vcftools from command line.
20 Depending on the argument in question, a default value may be specified'''
21
22 def parser_confirm_file ():
23 '''Custom action to confirm file exists'''
24 class customAction(argparse.Action):
25 def __call__(self, parser, args, value, option_string=None):
26 if not os.path.isfile(value):
27 raise IOError('%s not found' % value)
28 setattr(args, self.dest, value)
29 return customAction
30
31 def metavar_list (var_list):
32 '''Create a formmated metavar list for the help output'''
33 return '{' + ', '.join(var_list) + '}'
34
35 phase_parser = argparse.ArgumentParser()
36
37 # Input arguments
38 phase_parser.add_argument('--vcf', help = "Input VCF filename", type = str, required = True, action = parser_confirm_file())
39
40 # Model file arguments
41 phase_parser.add_argument('--model-file', help = 'Defines the model file', type = str, action = parser_confirm_file())
42 phase_parser.add_argument('--model', help = 'Defines the model to analyze', type = str)
43
44 # General arguments
45 phase_parser.add_argument('--overwrite', help = "Overwrite previous output files", action = 'store_true')
46 phase_parser.add_argument('--beagle-path', help = "Defines path to locate beagle.jar", type = str, default = 'bin/')
47
48 # Phase algorithm argument
49 phasing_list = ['beagle', 'shapeit']
50 phasing_default = 'beagle'
51 phase_parser.add_argument('--phase-algorithm', metavar = metavar_list(phasing_list), help = 'Specifies the phase algorithm to be used', type = str, choices = phasing_list, default = phasing_default)
52
53 # Common phasing arguments
54 phase_parser.add_argument('--Ne', help = 'Defines the effective population size', type = int)
55 phase_parser.add_argument('--random-seed', help="Defines the random seed value for the random number generator", type = int)
56 phase_parser.add_argument('--genetic-map', help = 'Genetic map filename', type = str, action = parser_confirm_file())
57 phase_parser.add_argument('--phase-chr', help = 'Selects a single chromosome to phase', type = str)
58 phase_parser.add_argument('--phase-from-bp', help = 'Lower bound of sites to include (Only usable with a single chromosome)', type = int)
59 phase_parser.add_argument('--phase-to-bp', help = 'Upper bound of sites to include (Only usable with a single chromosome)', type = int)
60
61 # Shapeit-specific options
62 phase_parser.add_argument('--shapeit-burn-iter', help = 'Number of the burn-in iterations (shapeit)', type = int)
63 phase_parser.add_argument('--shapeit-prune-iter', help = 'Number of pruning iterations (shapeit)', type = int)
64 phase_parser.add_argument('--shapeit-main-iter', help = 'Number of main iterations (shapeit)', type = int)
65 phase_parser.add_argument('--shapeit-states', help = 'Number of conditioning states for haplotype estimation (shapeit)', type = int)
66 phase_parser.add_argument('--shapeit-window', help = 'Model window size in Mb (shapeit)', type = float)
67
68 # Beagle-specific options
69 phase_parser.add_argument('--beagle-burn-iter', help = 'Number of the burn-in iterations (beagle)', type = int)
70 phase_parser.add_argument('--beagle-iter', help = 'Number of iterations after burn-in (beagle)', type = int)
71 phase_parser.add_argument('--beagle-states', help = 'Number of model states for genotype estimation (beagle)', type = int)
72 phase_parser.add_argument('--beagle-window', help = 'Sliding window size in cM (beagle)', type = float)
73 phase_parser.add_argument('--beagle-overlap', help = 'Overlap between neighboring sliding windows in cM (beagle)', type = float)
74 phase_parser.add_argument('--beagle-error', help = 'HMM allele mismatch probability (beagle)', type = float)
75 phase_parser.add_argument('--beagle-step', help = 'Step length in cM used for identifying short IBS segments (beagle)', type = float)
76 phase_parser.add_argument('--beagle-nsteps', help = 'Number of consecutive --beagle-steps used for identifying long IBS segments (beagle)', type = int)
77
78 # Output arguments
79 phase_parser.add_argument('--out', help = 'Defines the output filename')
80 phase_parser.add_argument('--out-prefix', help = 'Defines the output prefix (used by phasing algorithms)', default = 'out')
81
82 out_format_list = ['vcf', 'vcf.gz', 'bcf']
83 out_format_default = 'vcf.gz'
84
85 phase_parser.add_argument('--out-format', metavar = metavar_list(out_format_list), help = 'Specifies the output format.', type = str, choices = out_format_list, default = out_format_default)
86
87 if passed_arguments:
88 return phase_parser.parse_args(passed_arguments)
89 else:
90 return phase_parser.parse_args()
91
92 def concatenate_logs (log_files, new_log_filename):
93 '''
94 Concatenates log files
95
96 This function is used to concatenate log files. If a single chromosome
97 is being analyzed, the function is used to concatenate the log files
98 from multiple wrappers. If multiple chromosomes are being analyzed the
99 function is used to concatenate the logs of each chromosome.
100
101 Raises
102 ------
103 IOError
104 If a log file does not exist
105 '''
106
107 # Open the new log file
108 with open(new_log_filename, 'wb') as out_file:
109 # Loop the logs to be combined
110 for log_file in log_files:
111 # Confirm the log file exists, raise an error if not
112 if not os.path.isfile(log_file):
113 raise IOError('%s not found' % log_file)
114 # Open the log file
115 with open(log_file, 'rb') as in_file:
116 # Copy the contents of the file
117 shutil.copyfileobj(in_file, out_file)
118
119 # Remove old log files
120 for log_file in log_files:
121 os.remove(log_file)
122
123 def assign_vcf_extension (filename):
124 # Checks if the file is unzipped, bgzipped, or gzipped
125 vcfname_format = checkFormat(filename)
126
127 if vcfname_format == 'vcf':
128 return '.vcf'
129
130 elif vcfname_format == 'gzip' or vcfname_format == 'bgzip':
131 return '.vcf.gz'
132
133 elif vcfname_format == 'bcf':
134 raise Exception('BCF not supported in current version')
135
136 else:
137 raise Exception('Unknown file format')
138
139 def run (passed_arguments = []):
140 '''
141 Phaser for VCF files.
142
143 Automates the phasing process for a specified VCF file. The function
144 allows users to select between multiple phasing algorithms: beagle
145 (default) and shapit.
146
147 Parameters
148 ----------
149 --vcf : str
150 Specifies the input VCF filename
151 --phase-algorithm : str
152 Specifies the algorithm to be used. Choices: beagle (default) and
153 shapit
154 --out : str
155 Specifies the output filename
156
157 Returns
158 -------
159 output : file
160 Phased VCF file
161
162 Raises
163 ------
164 IOError
165 Input VCF file does not exist
166 IOError
167 Output file already exists
168
169 '''
170
171 # Grab VCF arguments from command line
172 phase_args = phase_argument_parser(passed_arguments)
173
174 # Adds the arguments (i.e. parameters) to the log file
175 logArgs(phase_args, func_name = 'vcf_phase')
176
177 # Assign file extension for VCF input file
178 vcfname_ext = assign_vcf_extension(phase_args.vcf)
179
180 logging.info('Input file assigned')
181
182 # Used to confirm if the VCF input was renamed
183 vcfname_renamed = False
184
185 # Confirm input has correct file extension
186 if vcfname_ext not in phase_args.vcf:
187 vcfname_renamed = True
188 os.rename(phase_args.vcf, phase_args.vcf + vcfname_ext)
189 phase_args.vcf += vcfname_ext
190
191 # List to hold beagle/shapeit arguments
192 phase_call_args = []
193
194 # bool to check if an eclude file was created
195 exclude_file_created = False
196
197 # bool to check if an include file was created
198 include_file_created = False
199
200 # Performs actions related to --model-file and argument assignment
201 if phase_args.model_file:
202
203 # Check if a model has been specified
204 if not phase_args.model:
205 raise IOError('No selected model. Please use --model to select a model')
206
207 # Read in the model file
208 models_file = read_model_file(phase_args.model_file)
209
210 # Check that the selected model was found in the file
211 if phase_args.model not in models_file:
212 raise IOError('Selected model "%s" not found in: %s' % (phase_args.model, phase_args.model_file))
213
214 logging.info('Model assigned')
215
216 if phase_args.phase_algorithm == 'beagle':
217
218 # Get list of individuals to include
219 inds_in_model = models_file[phase_args.model].inds
220
221 # Create file with individuals to exclude (beagle has no include option)
222 models_file.create_exclude_ind_file(inds_to_include = inds_in_model, overwrite = phase_args.overwrite)
223
224 # Assign exclude file
225 phase_call_args.append('excludesamples=' + models_file.exclude_file)
226
227 # Confirm that the exclude file was created
228 exclude_file_created = True
229
230 elif phase_args.phase_algorithm == 'shapeit':
231
232 # Assign the model
233 selected_model = models_file[phase_args.model]
234
235 # Create file with individuals to exclude (beagle has no include option)
236 selected_model.create_ind_file(overwrite = phase_args.overwrite)
237
238 # Assign exclude file
239 phase_call_args.extend(['--include-ind', selected_model.ind_file])
240
241 # Confirm that the include file was created
242 include_file_created = True
243
244 # Get the list of chromosomes within the VCF
245 chrs_in_vcf = pipe_bcftools_to_chr(phase_args.vcf)
246
247 # Check if the user specified a specific chromosome
248 if phase_args.phase_chr:
249 # Check if the chromosome is within the file
250 if phase_args.phase_chr not in chrs_in_vcf:
251 raise Exception('--phase-chr %s not found in %s' % (phase_args.phase_chr, phase_args.vcf))
252
253 # Check that a chr was specified if a bp-based argument was specified
254 if (phase_args.phase_from_bp or phase_args.phase_to_bp) and not phase_args.phase_chr:
255 # Check if the file has a single chromosome
256 if len(chrs_in_vcf) == 1:
257 # Check if the beagle algorithm is being used
258 if phase_args.phase_algorithm == 'beagle':
259 # Assign the chromosome, as beagle requires it to be assigned
260 phase_args.phase_chr = chrs_in_vcf[0]
261
262 logging.info('Assigned chr %s for beagle' % chrs_in_vcf[0])
263 # Report error if multiple chromosomes are within the file
264 else:
265 raise Exception('The --phase-from-bp and --phase-to-bp arguments '
266 'require the --phase-chr argument to function if '
267 'multiple chrs are within %s' % phase_args.vcf)
268
269 # Assign general arguments and call beagle
270 if phase_args.phase_algorithm == 'beagle':
271
272 # Assigns the input and output arguments
273 phase_call_args.extend(['gt=' + phase_args.vcf,
274 'out=' + phase_args.out_prefix])
275
276 # Assign the burn-in iter, if specified
277 if phase_args.beagle_burn_iter:
278 phase_call_args.append('burnin=' + phase_args.beagle_burn_iter)
279
280 # Assign the iter, if specified
281 if phase_args.beagle_iter:
282 phase_call_args.append('iterations=' + phase_args.beagle_iter)
283
284 # Assign the state count, if specified
285 if phase_args.beagle_states:
286 phase_call_args.append('phase-states=' + phase_args.beagle_states)
287
288 # Assign the window length, if specified
289 if phase_args.beagle_window:
290 phase_call_args.append('window=' + phase_args.beagle_window)
291
292 # Assign the window overlap, if specified
293 if phase_args.beagle_overlap:
294 phase_call_args.append('overlap=' + phase_args.beagle_overlap)
295
296 # Assign the HMM error, if specified
297 if phase_args.beagle_error:
298 phase_call_args.append('err=' + phase_args.beagle_error)
299
300 # Assign the step length, if specified
301 if phase_args.beagle_step:
302 phase_call_args.append('step=' + phase_args.beagle_step)
303
304 # Assign the number of steps, if specified
305 if phase_args.beagle_nsteps:
306 phase_call_args.append('nsteps=' + phase_args.beagle_nsteps)
307
308 # Assign the genetic map, if specified
309 if phase_args.genetic_map:
310 phase_call_args.append('map=' + phase_args.genetic_map)
311
312 # Assign the effective pop size, if specified
313 if phase_args.Ne:
314 phase_call_args.append('ne=' + phase_args.Ne)
315
316 # Assign the random seed, if specified
317 if phase_args.random_seed:
318 phase_call_args.append('seed=' + phase_args.random_seed)
319
320 # Assign the chromosome to phase, if specified
321 if phase_args.phase_chr:
322
323 # Store the chromosome argument
324 chr_arg = 'chrom=%s' % phase_args.phase_chr
325
326 # Check if either bp position arguments were specified
327 if phase_args.phase_from_bp or phase_args.phase_to_bp:
328
329 # List of the position arguments, in their required order
330 position_args = [':', phase_args.phase_from_bp, '-', phase_args.phase_to_bp]
331
332 # Filter the position arguments to remove empty values
333 filttered_position_args = filter(None, position_args)
334
335 # Map the arguments to str and add them to the chromosome argument
336 chr_arg += ''.join(map(str, filttered_position_args))
337
338 # Add the chromosome argument the argument list
339 phase_call_args.append(chr_arg)
340
341 # Call beagle wrapper
342 call_beagle(phase_args.beagle_path, list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format)
343
344 # Assign expected phased output filename
345 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)
346
347 # Rename output to phase_args.out, if specified
348 if phase_args.out:
349 shutil.move(phased_output, phase_args.out)
350 shutil.move(phase_args.out_prefix + '.log', phase_args.out + '.log')
351 # Rename log using phased_output
352 else:
353 shutil.move(phase_args.out_prefix + '.log', phased_output + '.log')
354
355 logging.info('beagle log file created')
356
357 # Assign general arguments and call shapeit
358 elif phase_args.phase_algorithm == 'shapeit':
359
360 # Assign the shapeit burn in iter, if specified
361 if phase_args.shapeit_burn_iter:
362 phase_call_args.extend(['--burn', phase_args.shapeit_burn_iter])
363
364 # Assign the shapeit prune iter, if specified
365 if phase_args.shapeit_prune_iter:
366 phase_call_args.extend(['--prune', phase_args.shapeit_prune_iter])
367
368 # Assign the shapeit main iter, if specified
369 if phase_args.shapeit_main_iter:
370 phase_call_args.extend(['--main', phase_args.shapeit_main_iter])
371
372 # Assign the number of shapeit states if specified
373 if phase_args.shapeit_states:
374 phase_call_args.extend(['--states', phase_args.shapeit_states])
375
376 # Assign the shapeit window size, if specified
377 if phase_args.shapeit_window:
378 phase_call_args.extend(['--window', phase_args.shapeit_window])
379
380 # Assign the genetic map, if specified
381 if phase_args.genetic_map:
382 phase_call_args.extend(['--input-map', phase_args.genetic_map])
383
384 # Assign the effective pop size, if specified
385 if phase_args.Ne:
386 phase_call_args.extend(['--effective-size', phase_args.Ne])
387
388 # Assign the random seed, if specified
389 if phase_args.random_seed:
390 phase_call_args.extend(['--seed', phase_args.random_seed])
391
392 # Check if only a single shapeit run is required
393 if phase_args.phase_chr or len(chrs_in_vcf) == 1:
394
395 # Holds shapeit input filename, as a temp file may be required
396 shapeit_input_vcf = None
397
398 # Check if a single chromosome is selected
399 if phase_args.phase_chr and len(chrs_in_vcf) > 1:
400 # Assign the chromosome-specific input prefix
401 chr_input_prefix = phase_args.vcf + '.' + phase_args.phase_chr
402
403 # Assign the chr-specific file as the shapeit input filename
404 shapeit_input_vcf = chr_input_prefix + '.vcf.gz'
405
406 # Create the chromosome-specific input
407 chr_subset_file(phase_args.vcf,
408 phase_args.phase_chr,
409 chr_input_prefix,
410 'vcf.gz',
411 from_bp = phase_args.phase_from_bp,
412 to_bp = phase_args.phase_to_bp)
413
414 logging.info('Chr %s subset created' % phase_args.phase_chr)
415
416 else:
417 # Assign the shapeit input filename
418 shapeit_input_vcf = phase_args.vcf
419
420 # Assigns the input and output arguments for shapeit
421 phase_call_args.extend(['--input-vcf', shapeit_input_vcf,
422 '--output-max', phase_args.out_prefix,
423 '--output-log', phase_args.out_prefix + '.phase.log'])
424
425 # Assign the from bp position argument, if specified
426 if phase_args.phase_from_bp:
427 phase_call_args.extend(['--input-from', phase_args.phase_from_bp])
428
429 # Assign the to bp position argument, if specified
430 if phase_args.phase_to_bp:
431 phase_call_args.extend(['--input-to', phase_args.phase_to_bp])
432
433 # Call shapeit wrapper
434 call_shapeit(list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format)
435
436 # Assign expected phased output filename
437 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)
438
439 # Combine the log files
440 concatenate_logs([phase_args.out_prefix + '.phase.log', phase_args.out_prefix + '.log'], phased_output + '.log')
441
442 # Rename output to phase_args.out, if specified
443 if phase_args.out:
444 shutil.move(phased_output, phase_args.out)
445 shutil.move(phased_output + '.log', phase_args.out + '.log')
446
447 logging.info('shapeit log file created')
448
449 # Check if a chr subset file was created
450 if phase_args.phase_chr and len(chrs_in_vcf) > 1:
451 # Delete the chromosome-specific input
452 os.remove(shapeit_input_vcf)
453
454 logging.info('Chr %s subset deleted' % phase_args.phase_chr)
455
456 # Remove intermediate files created by shapeit
457 remove_intermediate_files(phase_args.out_prefix)
458
459 # Check if multiple shapeit runs are required
460 else:
461
462 # List to store the phased filenames
463 phased_filename_list = []
464
465 # List to store the phased logs
466 phased_log_list = []
467
468 logging.info('Multi-chr shapeit phasing assigned')
469
470 for chr_in_vcf in chrs_in_vcf:
471
472 logging.info('Chr %s assigned' % chr_in_vcf)
473
474 # Copy the arguments for this run
475 chr_call_args = copy.deepcopy(phase_call_args)
476
477 # Assign the chromosome-specific output prefix
478 chr_out_prefix = phase_args.out_prefix + '.' + chr_in_vcf
479
480 # Assign the expected chromosome-specific output filename
481 chr_out_filename = '%s.%s' % (chr_out_prefix, phase_args.out_format)
482
483 # Store the output filename
484 phased_filename_list.append(chr_out_filename)
485
486 # Assign the chromosome-specific input prefix
487 chr_input_prefix = phase_args.vcf + '.' + chr_in_vcf
488
489 # Assign the expected chromosome-specific input filename
490 chr_in_filename = chr_input_prefix + '.vcf.gz'
491
492 # Create the chromosome-specific input
493 chr_subset_file(phase_args.vcf,
494 chr_in_vcf,
495 chr_input_prefix,
496 'vcf.gz')
497
498 # Assigns the input and output arguments for shapeit
499 chr_call_args.extend(['--input-vcf', chr_in_filename,
500 '--output-max', chr_out_prefix,
501 '--output-log', chr_out_prefix + '.phase.log'])
502
503 # Call shapeit wrapper
504 call_shapeit(list(map(str, chr_call_args)), chr_out_prefix, phase_args.out_format)
505
506 # Combine log files
507 concatenate_logs([chr_out_prefix + '.phase.log', chr_out_prefix + '.log'], chr_out_filename + '.log')
508
509 # Store the filename of the combined logs
510 phased_log_list.append(chr_out_filename + '.log')
511
512 # Delete the chromosome-specific input
513 os.remove(chr_in_filename)
514
515 # Remove intermediate files created by shapeit
516 remove_intermediate_files(chr_out_prefix)
517
518 # Concatenate the vcf files
519 concatenate(phased_filename_list, phase_args.out_prefix, phase_args.out_format)
520
521 logging.info('Concatenated chromosomes')
522
523 # Assign expected concatenated output filename
524 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)
525
526 # Combine the log files
527 concatenate_logs(phased_log_list, phased_output + '.log')
528
529 # Rename output to phase_args.out, if specified
530 if phase_args.out:
531 shutil.move(phased_output, phase_args.out)
532 shutil.move(phased_output + '.log', phase_args.out + '.log')
533
534 logging.info('Multi-chr shapeit log created')
535
536 # Reverts the VCF input file
537 if vcfname_renamed:
538 os.rename(phase_args.vcf, phase_args.vcf[:-len(vcfname_ext)])
539
540 if __name__ == "__main__":
541 initLogger()
542 run()