Mercurial > repos > jaredgk > ppp_vcfphase
comparison vcf_phase.py @ 0:3830d29fca6a draft
Uploaded
author | jaredgk |
---|---|
date | Mon, 15 Oct 2018 18:15:47 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3830d29fca6a |
---|---|
1 import os | |
2 import sys | |
3 import copy | |
4 import shutil | |
5 import argparse | |
6 import glob | |
7 import logging | |
8 | |
9 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared'))) | |
10 | |
11 from vcf_reader_func import checkFormat | |
12 from logging_module import initLogger, logArgs | |
13 from model import read_model_file | |
14 from beagle import call_beagle | |
15 from shapeit import call_shapeit, remove_intermediate_files | |
16 from bcftools import pipe_bcftools_to_chr, chr_subset_file, concatenate | |
17 | |
18 def phase_argument_parser(passed_arguments): | |
19 '''Phase Argument Parser - Assigns arguments for vcftools from command line. | |
20 Depending on the argument in question, a default value may be specified''' | |
21 | |
22 def parser_confirm_file (): | |
23 '''Custom action to confirm file exists''' | |
24 class customAction(argparse.Action): | |
25 def __call__(self, parser, args, value, option_string=None): | |
26 if not os.path.isfile(value): | |
27 raise IOError('%s not found' % value) | |
28 setattr(args, self.dest, value) | |
29 return customAction | |
30 | |
31 def metavar_list (var_list): | |
32 '''Create a formmated metavar list for the help output''' | |
33 return '{' + ', '.join(var_list) + '}' | |
34 | |
35 phase_parser = argparse.ArgumentParser() | |
36 | |
37 # Input arguments | |
38 phase_parser.add_argument('--vcf', help = "Input VCF filename", type = str, required = True, action = parser_confirm_file()) | |
39 | |
40 # Model file arguments | |
41 phase_parser.add_argument('--model-file', help = 'Defines the model file', type = str, action = parser_confirm_file()) | |
42 phase_parser.add_argument('--model', help = 'Defines the model to analyze', type = str) | |
43 | |
44 # General arguments | |
45 phase_parser.add_argument('--overwrite', help = "Overwrite previous output files", action = 'store_true') | |
46 phase_parser.add_argument('--beagle-path', help = "Defines path to locate beagle.jar", type = str, default = 'bin/') | |
47 | |
48 # Phase algorithm argument | |
49 phasing_list = ['beagle', 'shapeit'] | |
50 phasing_default = 'beagle' | |
51 phase_parser.add_argument('--phase-algorithm', metavar = metavar_list(phasing_list), help = 'Specifies the phase algorithm to be used', type = str, choices = phasing_list, default = phasing_default) | |
52 | |
53 # Common phasing arguments | |
54 phase_parser.add_argument('--Ne', help = 'Defines the effective population size', type = int) | |
55 phase_parser.add_argument('--random-seed', help="Defines the random seed value for the random number generator", type = int) | |
56 phase_parser.add_argument('--genetic-map', help = 'Genetic map filename', type = str, action = parser_confirm_file()) | |
57 phase_parser.add_argument('--phase-chr', help = 'Selects a single chromosome to phase', type = str) | |
58 phase_parser.add_argument('--phase-from-bp', help = 'Lower bound of sites to include (Only usable with a single chromosome)', type = int) | |
59 phase_parser.add_argument('--phase-to-bp', help = 'Upper bound of sites to include (Only usable with a single chromosome)', type = int) | |
60 | |
61 # Shapeit-specific options | |
62 phase_parser.add_argument('--shapeit-burn-iter', help = 'Number of the burn-in iterations (shapeit)', type = int) | |
63 phase_parser.add_argument('--shapeit-prune-iter', help = 'Number of pruning iterations (shapeit)', type = int) | |
64 phase_parser.add_argument('--shapeit-main-iter', help = 'Number of main iterations (shapeit)', type = int) | |
65 phase_parser.add_argument('--shapeit-states', help = 'Number of conditioning states for haplotype estimation (shapeit)', type = int) | |
66 phase_parser.add_argument('--shapeit-window', help = 'Model window size in Mb (shapeit)', type = float) | |
67 | |
68 # Beagle-specific options | |
69 phase_parser.add_argument('--beagle-burn-iter', help = 'Number of the burn-in iterations (beagle)', type = int) | |
70 phase_parser.add_argument('--beagle-iter', help = 'Number of iterations after burn-in (beagle)', type = int) | |
71 phase_parser.add_argument('--beagle-states', help = 'Number of model states for genotype estimation (beagle)', type = int) | |
72 phase_parser.add_argument('--beagle-window', help = 'Sliding window size in cM (beagle)', type = float) | |
73 phase_parser.add_argument('--beagle-overlap', help = 'Overlap between neighboring sliding windows in cM (beagle)', type = float) | |
74 phase_parser.add_argument('--beagle-error', help = 'HMM allele mismatch probability (beagle)', type = float) | |
75 phase_parser.add_argument('--beagle-step', help = 'Step length in cM used for identifying short IBS segments (beagle)', type = float) | |
76 phase_parser.add_argument('--beagle-nsteps', help = 'Number of consecutive --beagle-steps used for identifying long IBS segments (beagle)', type = int) | |
77 | |
78 # Output arguments | |
79 phase_parser.add_argument('--out', help = 'Defines the output filename') | |
80 phase_parser.add_argument('--out-prefix', help = 'Defines the output prefix (used by phasing algorithms)', default = 'out') | |
81 | |
82 out_format_list = ['vcf', 'vcf.gz', 'bcf'] | |
83 out_format_default = 'vcf.gz' | |
84 | |
85 phase_parser.add_argument('--out-format', metavar = metavar_list(out_format_list), help = 'Specifies the output format.', type = str, choices = out_format_list, default = out_format_default) | |
86 | |
87 if passed_arguments: | |
88 return phase_parser.parse_args(passed_arguments) | |
89 else: | |
90 return phase_parser.parse_args() | |
91 | |
92 def concatenate_logs (log_files, new_log_filename): | |
93 ''' | |
94 Concatenates log files | |
95 | |
96 This function is used to concatenate log files. If a single chromosome | |
97 is being analyzed, the function is used to concatenate the log files | |
98 from multiple wrappers. If multiple chromosomes are being analyzed the | |
99 function is used to concatenate the logs of each chromosome. | |
100 | |
101 Raises | |
102 ------ | |
103 IOError | |
104 If a log file does not exist | |
105 ''' | |
106 | |
107 # Open the new log file | |
108 with open(new_log_filename, 'wb') as out_file: | |
109 # Loop the logs to be combined | |
110 for log_file in log_files: | |
111 # Confirm the log file exists, raise an error if not | |
112 if not os.path.isfile(log_file): | |
113 raise IOError('%s not found' % log_file) | |
114 # Open the log file | |
115 with open(log_file, 'rb') as in_file: | |
116 # Copy the contents of the file | |
117 shutil.copyfileobj(in_file, out_file) | |
118 | |
119 # Remove old log files | |
120 for log_file in log_files: | |
121 os.remove(log_file) | |
122 | |
123 def assign_vcf_extension (filename): | |
124 # Checks if the file is unzipped, bgzipped, or gzipped | |
125 vcfname_format = checkFormat(filename) | |
126 | |
127 if vcfname_format == 'vcf': | |
128 return '.vcf' | |
129 | |
130 elif vcfname_format == 'gzip' or vcfname_format == 'bgzip': | |
131 return '.vcf.gz' | |
132 | |
133 elif vcfname_format == 'bcf': | |
134 raise Exception('BCF not supported in current version') | |
135 | |
136 else: | |
137 raise Exception('Unknown file format') | |
138 | |
139 def run (passed_arguments = []): | |
140 ''' | |
141 Phaser for VCF files. | |
142 | |
143 Automates the phasing process for a specified VCF file. The function | |
144 allows users to select between multiple phasing algorithms: beagle | |
145 (default) and shapit. | |
146 | |
147 Parameters | |
148 ---------- | |
149 --vcf : str | |
150 Specifies the input VCF filename | |
151 --phase-algorithm : str | |
152 Specifies the algorithm to be used. Choices: beagle (default) and | |
153 shapit | |
154 --out : str | |
155 Specifies the output filename | |
156 | |
157 Returns | |
158 ------- | |
159 output : file | |
160 Phased VCF file | |
161 | |
162 Raises | |
163 ------ | |
164 IOError | |
165 Input VCF file does not exist | |
166 IOError | |
167 Output file already exists | |
168 | |
169 ''' | |
170 | |
171 # Grab VCF arguments from command line | |
172 phase_args = phase_argument_parser(passed_arguments) | |
173 | |
174 # Adds the arguments (i.e. parameters) to the log file | |
175 logArgs(phase_args, func_name = 'vcf_phase') | |
176 | |
177 # Assign file extension for VCF input file | |
178 vcfname_ext = assign_vcf_extension(phase_args.vcf) | |
179 | |
180 logging.info('Input file assigned') | |
181 | |
182 # Used to confirm if the VCF input was renamed | |
183 vcfname_renamed = False | |
184 | |
185 # Confirm input has correct file extension | |
186 if vcfname_ext not in phase_args.vcf: | |
187 vcfname_renamed = True | |
188 os.rename(phase_args.vcf, phase_args.vcf + vcfname_ext) | |
189 phase_args.vcf += vcfname_ext | |
190 | |
191 # List to hold beagle/shapeit arguments | |
192 phase_call_args = [] | |
193 | |
194 # bool to check if an eclude file was created | |
195 exclude_file_created = False | |
196 | |
197 # bool to check if an include file was created | |
198 include_file_created = False | |
199 | |
200 # Performs actions related to --model-file and argument assignment | |
201 if phase_args.model_file: | |
202 | |
203 # Check if a model has been specified | |
204 if not phase_args.model: | |
205 raise IOError('No selected model. Please use --model to select a model') | |
206 | |
207 # Read in the model file | |
208 models_file = read_model_file(phase_args.model_file) | |
209 | |
210 # Check that the selected model was found in the file | |
211 if phase_args.model not in models_file: | |
212 raise IOError('Selected model "%s" not found in: %s' % (phase_args.model, phase_args.model_file)) | |
213 | |
214 logging.info('Model assigned') | |
215 | |
216 if phase_args.phase_algorithm == 'beagle': | |
217 | |
218 # Get list of individuals to include | |
219 inds_in_model = models_file[phase_args.model].inds | |
220 | |
221 # Create file with individuals to exclude (beagle has no include option) | |
222 models_file.create_exclude_ind_file(inds_to_include = inds_in_model, overwrite = phase_args.overwrite) | |
223 | |
224 # Assign exclude file | |
225 phase_call_args.append('excludesamples=' + models_file.exclude_file) | |
226 | |
227 # Confirm that the exclude file was created | |
228 exclude_file_created = True | |
229 | |
230 elif phase_args.phase_algorithm == 'shapeit': | |
231 | |
232 # Assign the model | |
233 selected_model = models_file[phase_args.model] | |
234 | |
235 # Create file with individuals to exclude (beagle has no include option) | |
236 selected_model.create_ind_file(overwrite = phase_args.overwrite) | |
237 | |
238 # Assign exclude file | |
239 phase_call_args.extend(['--include-ind', selected_model.ind_file]) | |
240 | |
241 # Confirm that the include file was created | |
242 include_file_created = True | |
243 | |
244 # Get the list of chromosomes within the VCF | |
245 chrs_in_vcf = pipe_bcftools_to_chr(phase_args.vcf) | |
246 | |
247 # Check if the user specified a specific chromosome | |
248 if phase_args.phase_chr: | |
249 # Check if the chromosome is within the file | |
250 if phase_args.phase_chr not in chrs_in_vcf: | |
251 raise Exception('--phase-chr %s not found in %s' % (phase_args.phase_chr, phase_args.vcf)) | |
252 | |
253 # Check that a chr was specified if a bp-based argument was specified | |
254 if (phase_args.phase_from_bp or phase_args.phase_to_bp) and not phase_args.phase_chr: | |
255 # Check if the file has a single chromosome | |
256 if len(chrs_in_vcf) == 1: | |
257 # Check if the beagle algorithm is being used | |
258 if phase_args.phase_algorithm == 'beagle': | |
259 # Assign the chromosome, as beagle requires it to be assigned | |
260 phase_args.phase_chr = chrs_in_vcf[0] | |
261 | |
262 logging.info('Assigned chr %s for beagle' % chrs_in_vcf[0]) | |
263 # Report error if multiple chromosomes are within the file | |
264 else: | |
265 raise Exception('The --phase-from-bp and --phase-to-bp arguments ' | |
266 'require the --phase-chr argument to function if ' | |
267 'multiple chrs are within %s' % phase_args.vcf) | |
268 | |
269 # Assign general arguments and call beagle | |
270 if phase_args.phase_algorithm == 'beagle': | |
271 | |
272 # Assigns the input and output arguments | |
273 phase_call_args.extend(['gt=' + phase_args.vcf, | |
274 'out=' + phase_args.out_prefix]) | |
275 | |
276 # Assign the burn-in iter, if specified | |
277 if phase_args.beagle_burn_iter: | |
278 phase_call_args.append('burnin=' + phase_args.beagle_burn_iter) | |
279 | |
280 # Assign the iter, if specified | |
281 if phase_args.beagle_iter: | |
282 phase_call_args.append('iterations=' + phase_args.beagle_iter) | |
283 | |
284 # Assign the state count, if specified | |
285 if phase_args.beagle_states: | |
286 phase_call_args.append('phase-states=' + phase_args.beagle_states) | |
287 | |
288 # Assign the window length, if specified | |
289 if phase_args.beagle_window: | |
290 phase_call_args.append('window=' + phase_args.beagle_window) | |
291 | |
292 # Assign the window overlap, if specified | |
293 if phase_args.beagle_overlap: | |
294 phase_call_args.append('overlap=' + phase_args.beagle_overlap) | |
295 | |
296 # Assign the HMM error, if specified | |
297 if phase_args.beagle_error: | |
298 phase_call_args.append('err=' + phase_args.beagle_error) | |
299 | |
300 # Assign the step length, if specified | |
301 if phase_args.beagle_step: | |
302 phase_call_args.append('step=' + phase_args.beagle_step) | |
303 | |
304 # Assign the number of steps, if specified | |
305 if phase_args.beagle_nsteps: | |
306 phase_call_args.append('nsteps=' + phase_args.beagle_nsteps) | |
307 | |
308 # Assign the genetic map, if specified | |
309 if phase_args.genetic_map: | |
310 phase_call_args.append('map=' + phase_args.genetic_map) | |
311 | |
312 # Assign the effective pop size, if specified | |
313 if phase_args.Ne: | |
314 phase_call_args.append('ne=' + phase_args.Ne) | |
315 | |
316 # Assign the random seed, if specified | |
317 if phase_args.random_seed: | |
318 phase_call_args.append('seed=' + phase_args.random_seed) | |
319 | |
320 # Assign the chromosome to phase, if specified | |
321 if phase_args.phase_chr: | |
322 | |
323 # Store the chromosome argument | |
324 chr_arg = 'chrom=%s' % phase_args.phase_chr | |
325 | |
326 # Check if either bp position arguments were specified | |
327 if phase_args.phase_from_bp or phase_args.phase_to_bp: | |
328 | |
329 # List of the position arguments, in their required order | |
330 position_args = [':', phase_args.phase_from_bp, '-', phase_args.phase_to_bp] | |
331 | |
332 # Filter the position arguments to remove empty values | |
333 filttered_position_args = filter(None, position_args) | |
334 | |
335 # Map the arguments to str and add them to the chromosome argument | |
336 chr_arg += ''.join(map(str, filttered_position_args)) | |
337 | |
338 # Add the chromosome argument the argument list | |
339 phase_call_args.append(chr_arg) | |
340 | |
341 # Call beagle wrapper | |
342 call_beagle(phase_args.beagle_path, list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format) | |
343 | |
344 # Assign expected phased output filename | |
345 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format) | |
346 | |
347 # Rename output to phase_args.out, if specified | |
348 if phase_args.out: | |
349 shutil.move(phased_output, phase_args.out) | |
350 shutil.move(phase_args.out_prefix + '.log', phase_args.out + '.log') | |
351 # Rename log using phased_output | |
352 else: | |
353 shutil.move(phase_args.out_prefix + '.log', phased_output + '.log') | |
354 | |
355 logging.info('beagle log file created') | |
356 | |
357 # Assign general arguments and call shapeit | |
358 elif phase_args.phase_algorithm == 'shapeit': | |
359 | |
360 # Assign the shapeit burn in iter, if specified | |
361 if phase_args.shapeit_burn_iter: | |
362 phase_call_args.extend(['--burn', phase_args.shapeit_burn_iter]) | |
363 | |
364 # Assign the shapeit prune iter, if specified | |
365 if phase_args.shapeit_prune_iter: | |
366 phase_call_args.extend(['--prune', phase_args.shapeit_prune_iter]) | |
367 | |
368 # Assign the shapeit main iter, if specified | |
369 if phase_args.shapeit_main_iter: | |
370 phase_call_args.extend(['--main', phase_args.shapeit_main_iter]) | |
371 | |
372 # Assign the number of shapeit states if specified | |
373 if phase_args.shapeit_states: | |
374 phase_call_args.extend(['--states', phase_args.shapeit_states]) | |
375 | |
376 # Assign the shapeit window size, if specified | |
377 if phase_args.shapeit_window: | |
378 phase_call_args.extend(['--window', phase_args.shapeit_window]) | |
379 | |
380 # Assign the genetic map, if specified | |
381 if phase_args.genetic_map: | |
382 phase_call_args.extend(['--input-map', phase_args.genetic_map]) | |
383 | |
384 # Assign the effective pop size, if specified | |
385 if phase_args.Ne: | |
386 phase_call_args.extend(['--effective-size', phase_args.Ne]) | |
387 | |
388 # Assign the random seed, if specified | |
389 if phase_args.random_seed: | |
390 phase_call_args.extend(['--seed', phase_args.random_seed]) | |
391 | |
392 # Check if only a single shapeit run is required | |
393 if phase_args.phase_chr or len(chrs_in_vcf) == 1: | |
394 | |
395 # Holds shapeit input filename, as a temp file may be required | |
396 shapeit_input_vcf = None | |
397 | |
398 # Check if a single chromosome is selected | |
399 if phase_args.phase_chr and len(chrs_in_vcf) > 1: | |
400 # Assign the chromosome-specific input prefix | |
401 chr_input_prefix = phase_args.vcf + '.' + phase_args.phase_chr | |
402 | |
403 # Assign the chr-specific file as the shapeit input filename | |
404 shapeit_input_vcf = chr_input_prefix + '.vcf.gz' | |
405 | |
406 # Create the chromosome-specific input | |
407 chr_subset_file(phase_args.vcf, | |
408 phase_args.phase_chr, | |
409 chr_input_prefix, | |
410 'vcf.gz', | |
411 from_bp = phase_args.phase_from_bp, | |
412 to_bp = phase_args.phase_to_bp) | |
413 | |
414 logging.info('Chr %s subset created' % phase_args.phase_chr) | |
415 | |
416 else: | |
417 # Assign the shapeit input filename | |
418 shapeit_input_vcf = phase_args.vcf | |
419 | |
420 # Assigns the input and output arguments for shapeit | |
421 phase_call_args.extend(['--input-vcf', shapeit_input_vcf, | |
422 '--output-max', phase_args.out_prefix, | |
423 '--output-log', phase_args.out_prefix + '.phase.log']) | |
424 | |
425 # Assign the from bp position argument, if specified | |
426 if phase_args.phase_from_bp: | |
427 phase_call_args.extend(['--input-from', phase_args.phase_from_bp]) | |
428 | |
429 # Assign the to bp position argument, if specified | |
430 if phase_args.phase_to_bp: | |
431 phase_call_args.extend(['--input-to', phase_args.phase_to_bp]) | |
432 | |
433 # Call shapeit wrapper | |
434 call_shapeit(list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format) | |
435 | |
436 # Assign expected phased output filename | |
437 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format) | |
438 | |
439 # Combine the log files | |
440 concatenate_logs([phase_args.out_prefix + '.phase.log', phase_args.out_prefix + '.log'], phased_output + '.log') | |
441 | |
442 # Rename output to phase_args.out, if specified | |
443 if phase_args.out: | |
444 shutil.move(phased_output, phase_args.out) | |
445 shutil.move(phased_output + '.log', phase_args.out + '.log') | |
446 | |
447 logging.info('shapeit log file created') | |
448 | |
449 # Check if a chr subset file was created | |
450 if phase_args.phase_chr and len(chrs_in_vcf) > 1: | |
451 # Delete the chromosome-specific input | |
452 os.remove(shapeit_input_vcf) | |
453 | |
454 logging.info('Chr %s subset deleted' % phase_args.phase_chr) | |
455 | |
456 # Remove intermediate files created by shapeit | |
457 remove_intermediate_files(phase_args.out_prefix) | |
458 | |
459 # Check if multiple shapeit runs are required | |
460 else: | |
461 | |
462 # List to store the phased filenames | |
463 phased_filename_list = [] | |
464 | |
465 # List to store the phased logs | |
466 phased_log_list = [] | |
467 | |
468 logging.info('Multi-chr shapeit phasing assigned') | |
469 | |
470 for chr_in_vcf in chrs_in_vcf: | |
471 | |
472 logging.info('Chr %s assigned' % chr_in_vcf) | |
473 | |
474 # Copy the arguments for this run | |
475 chr_call_args = copy.deepcopy(phase_call_args) | |
476 | |
477 # Assign the chromosome-specific output prefix | |
478 chr_out_prefix = phase_args.out_prefix + '.' + chr_in_vcf | |
479 | |
480 # Assign the expected chromosome-specific output filename | |
481 chr_out_filename = '%s.%s' % (chr_out_prefix, phase_args.out_format) | |
482 | |
483 # Store the output filename | |
484 phased_filename_list.append(chr_out_filename) | |
485 | |
486 # Assign the chromosome-specific input prefix | |
487 chr_input_prefix = phase_args.vcf + '.' + chr_in_vcf | |
488 | |
489 # Assign the expected chromosome-specific input filename | |
490 chr_in_filename = chr_input_prefix + '.vcf.gz' | |
491 | |
492 # Create the chromosome-specific input | |
493 chr_subset_file(phase_args.vcf, | |
494 chr_in_vcf, | |
495 chr_input_prefix, | |
496 'vcf.gz') | |
497 | |
498 # Assigns the input and output arguments for shapeit | |
499 chr_call_args.extend(['--input-vcf', chr_in_filename, | |
500 '--output-max', chr_out_prefix, | |
501 '--output-log', chr_out_prefix + '.phase.log']) | |
502 | |
503 # Call shapeit wrapper | |
504 call_shapeit(list(map(str, chr_call_args)), chr_out_prefix, phase_args.out_format) | |
505 | |
506 # Combine log files | |
507 concatenate_logs([chr_out_prefix + '.phase.log', chr_out_prefix + '.log'], chr_out_filename + '.log') | |
508 | |
509 # Store the filename of the combined logs | |
510 phased_log_list.append(chr_out_filename + '.log') | |
511 | |
512 # Delete the chromosome-specific input | |
513 os.remove(chr_in_filename) | |
514 | |
515 # Remove intermediate files created by shapeit | |
516 remove_intermediate_files(chr_out_prefix) | |
517 | |
518 # Concatenate the vcf files | |
519 concatenate(phased_filename_list, phase_args.out_prefix, phase_args.out_format) | |
520 | |
521 logging.info('Concatenated chromosomes') | |
522 | |
523 # Assign expected concatenated output filename | |
524 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format) | |
525 | |
526 # Combine the log files | |
527 concatenate_logs(phased_log_list, phased_output + '.log') | |
528 | |
529 # Rename output to phase_args.out, if specified | |
530 if phase_args.out: | |
531 shutil.move(phased_output, phase_args.out) | |
532 shutil.move(phased_output + '.log', phase_args.out + '.log') | |
533 | |
534 logging.info('Multi-chr shapeit log created') | |
535 | |
536 # Reverts the VCF input file | |
537 if vcfname_renamed: | |
538 os.rename(phase_args.vcf, phase_args.vcf[:-len(vcfname_ext)]) | |
539 | |
540 if __name__ == "__main__": | |
541 initLogger() | |
542 run() |