annotate vcf_phase.py @ 3:d1e3db7f6521 draft

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:28:38 -0400
parents 3830d29fca6a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
1 import os
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
2 import sys
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
3 import copy
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
4 import shutil
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
5 import argparse
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
6 import glob
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
7 import logging
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
8
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
9 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
10
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
11 from vcf_reader_func import checkFormat
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
12 from logging_module import initLogger, logArgs
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
13 from model import read_model_file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
14 from beagle import call_beagle
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
15 from shapeit import call_shapeit, remove_intermediate_files
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
16 from bcftools import pipe_bcftools_to_chr, chr_subset_file, concatenate
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
17
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
18 def phase_argument_parser(passed_arguments):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
19 '''Phase Argument Parser - Assigns arguments for vcftools from command line.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
20 Depending on the argument in question, a default value may be specified'''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
21
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
22 def parser_confirm_file ():
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
23 '''Custom action to confirm file exists'''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
24 class customAction(argparse.Action):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
25 def __call__(self, parser, args, value, option_string=None):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
26 if not os.path.isfile(value):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
27 raise IOError('%s not found' % value)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
28 setattr(args, self.dest, value)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
29 return customAction
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
30
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
31 def metavar_list (var_list):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
32 '''Create a formmated metavar list for the help output'''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
33 return '{' + ', '.join(var_list) + '}'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
34
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
35 phase_parser = argparse.ArgumentParser()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
36
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
37 # Input arguments
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
38 phase_parser.add_argument('--vcf', help = "Input VCF filename", type = str, required = True, action = parser_confirm_file())
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
39
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
40 # Model file arguments
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
41 phase_parser.add_argument('--model-file', help = 'Defines the model file', type = str, action = parser_confirm_file())
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
42 phase_parser.add_argument('--model', help = 'Defines the model to analyze', type = str)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
43
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
44 # General arguments
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
45 phase_parser.add_argument('--overwrite', help = "Overwrite previous output files", action = 'store_true')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
46 phase_parser.add_argument('--beagle-path', help = "Defines path to locate beagle.jar", type = str, default = 'bin/')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
47
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
48 # Phase algorithm argument
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
49 phasing_list = ['beagle', 'shapeit']
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
50 phasing_default = 'beagle'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
51 phase_parser.add_argument('--phase-algorithm', metavar = metavar_list(phasing_list), help = 'Specifies the phase algorithm to be used', type = str, choices = phasing_list, default = phasing_default)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
52
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
53 # Common phasing arguments
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
54 phase_parser.add_argument('--Ne', help = 'Defines the effective population size', type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
55 phase_parser.add_argument('--random-seed', help="Defines the random seed value for the random number generator", type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
56 phase_parser.add_argument('--genetic-map', help = 'Genetic map filename', type = str, action = parser_confirm_file())
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
57 phase_parser.add_argument('--phase-chr', help = 'Selects a single chromosome to phase', type = str)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
58 phase_parser.add_argument('--phase-from-bp', help = 'Lower bound of sites to include (Only usable with a single chromosome)', type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
59 phase_parser.add_argument('--phase-to-bp', help = 'Upper bound of sites to include (Only usable with a single chromosome)', type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
60
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
61 # Shapeit-specific options
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
62 phase_parser.add_argument('--shapeit-burn-iter', help = 'Number of the burn-in iterations (shapeit)', type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
63 phase_parser.add_argument('--shapeit-prune-iter', help = 'Number of pruning iterations (shapeit)', type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
64 phase_parser.add_argument('--shapeit-main-iter', help = 'Number of main iterations (shapeit)', type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
65 phase_parser.add_argument('--shapeit-states', help = 'Number of conditioning states for haplotype estimation (shapeit)', type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
66 phase_parser.add_argument('--shapeit-window', help = 'Model window size in Mb (shapeit)', type = float)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
67
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
68 # Beagle-specific options
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
69 phase_parser.add_argument('--beagle-burn-iter', help = 'Number of the burn-in iterations (beagle)', type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
70 phase_parser.add_argument('--beagle-iter', help = 'Number of iterations after burn-in (beagle)', type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
71 phase_parser.add_argument('--beagle-states', help = 'Number of model states for genotype estimation (beagle)', type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
72 phase_parser.add_argument('--beagle-window', help = 'Sliding window size in cM (beagle)', type = float)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
73 phase_parser.add_argument('--beagle-overlap', help = 'Overlap between neighboring sliding windows in cM (beagle)', type = float)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
74 phase_parser.add_argument('--beagle-error', help = 'HMM allele mismatch probability (beagle)', type = float)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
75 phase_parser.add_argument('--beagle-step', help = 'Step length in cM used for identifying short IBS segments (beagle)', type = float)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
76 phase_parser.add_argument('--beagle-nsteps', help = 'Number of consecutive --beagle-steps used for identifying long IBS segments (beagle)', type = int)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
77
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
78 # Output arguments
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
79 phase_parser.add_argument('--out', help = 'Defines the output filename')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
80 phase_parser.add_argument('--out-prefix', help = 'Defines the output prefix (used by phasing algorithms)', default = 'out')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
81
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
82 out_format_list = ['vcf', 'vcf.gz', 'bcf']
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
83 out_format_default = 'vcf.gz'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
84
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
85 phase_parser.add_argument('--out-format', metavar = metavar_list(out_format_list), help = 'Specifies the output format.', type = str, choices = out_format_list, default = out_format_default)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
86
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
87 if passed_arguments:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
88 return phase_parser.parse_args(passed_arguments)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
89 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
90 return phase_parser.parse_args()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
91
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
92 def concatenate_logs (log_files, new_log_filename):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
93 '''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
94 Concatenates log files
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
95
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
96 This function is used to concatenate log files. If a single chromosome
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
97 is being analyzed, the function is used to concatenate the log files
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
98 from multiple wrappers. If multiple chromosomes are being analyzed the
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
99 function is used to concatenate the logs of each chromosome.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
100
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
101 Raises
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
102 ------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
103 IOError
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
104 If a log file does not exist
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
105 '''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
106
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
107 # Open the new log file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
108 with open(new_log_filename, 'wb') as out_file:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
109 # Loop the logs to be combined
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
110 for log_file in log_files:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
111 # Confirm the log file exists, raise an error if not
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
112 if not os.path.isfile(log_file):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
113 raise IOError('%s not found' % log_file)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
114 # Open the log file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
115 with open(log_file, 'rb') as in_file:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
116 # Copy the contents of the file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
117 shutil.copyfileobj(in_file, out_file)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
118
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
119 # Remove old log files
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
120 for log_file in log_files:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
121 os.remove(log_file)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
122
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
123 def assign_vcf_extension (filename):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
124 # Checks if the file is unzipped, bgzipped, or gzipped
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
125 vcfname_format = checkFormat(filename)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
126
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
127 if vcfname_format == 'vcf':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
128 return '.vcf'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
129
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
130 elif vcfname_format == 'gzip' or vcfname_format == 'bgzip':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
131 return '.vcf.gz'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
132
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
133 elif vcfname_format == 'bcf':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
134 raise Exception('BCF not supported in current version')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
135
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
136 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
137 raise Exception('Unknown file format')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
138
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
139 def run (passed_arguments = []):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
140 '''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
141 Phaser for VCF files.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
142
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
143 Automates the phasing process for a specified VCF file. The function
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
144 allows users to select between multiple phasing algorithms: beagle
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
145 (default) and shapit.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
146
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
147 Parameters
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
148 ----------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
149 --vcf : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
150 Specifies the input VCF filename
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
151 --phase-algorithm : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
152 Specifies the algorithm to be used. Choices: beagle (default) and
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
153 shapit
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
154 --out : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
155 Specifies the output filename
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
156
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
157 Returns
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
158 -------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
159 output : file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
160 Phased VCF file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
161
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
162 Raises
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
163 ------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
164 IOError
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
165 Input VCF file does not exist
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
166 IOError
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
167 Output file already exists
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
168
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
169 '''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
170
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
171 # Grab VCF arguments from command line
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
172 phase_args = phase_argument_parser(passed_arguments)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
173
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
174 # Adds the arguments (i.e. parameters) to the log file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
175 logArgs(phase_args, func_name = 'vcf_phase')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
176
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
177 # Assign file extension for VCF input file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
178 vcfname_ext = assign_vcf_extension(phase_args.vcf)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
179
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
180 logging.info('Input file assigned')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
181
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
182 # Used to confirm if the VCF input was renamed
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
183 vcfname_renamed = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
184
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
185 # Confirm input has correct file extension
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
186 if vcfname_ext not in phase_args.vcf:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
187 vcfname_renamed = True
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
188 os.rename(phase_args.vcf, phase_args.vcf + vcfname_ext)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
189 phase_args.vcf += vcfname_ext
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
190
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
191 # List to hold beagle/shapeit arguments
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
192 phase_call_args = []
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
193
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
194 # bool to check if an eclude file was created
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
195 exclude_file_created = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
196
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
197 # bool to check if an include file was created
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
198 include_file_created = False
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
199
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
200 # Performs actions related to --model-file and argument assignment
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
201 if phase_args.model_file:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
202
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
203 # Check if a model has been specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
204 if not phase_args.model:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
205 raise IOError('No selected model. Please use --model to select a model')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
206
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
207 # Read in the model file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
208 models_file = read_model_file(phase_args.model_file)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
209
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
210 # Check that the selected model was found in the file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
211 if phase_args.model not in models_file:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
212 raise IOError('Selected model "%s" not found in: %s' % (phase_args.model, phase_args.model_file))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
213
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
214 logging.info('Model assigned')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
215
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
216 if phase_args.phase_algorithm == 'beagle':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
217
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
218 # Get list of individuals to include
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
219 inds_in_model = models_file[phase_args.model].inds
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
220
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
221 # Create file with individuals to exclude (beagle has no include option)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
222 models_file.create_exclude_ind_file(inds_to_include = inds_in_model, overwrite = phase_args.overwrite)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
223
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
224 # Assign exclude file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
225 phase_call_args.append('excludesamples=' + models_file.exclude_file)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
226
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
227 # Confirm that the exclude file was created
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
228 exclude_file_created = True
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
229
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
230 elif phase_args.phase_algorithm == 'shapeit':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
231
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
232 # Assign the model
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
233 selected_model = models_file[phase_args.model]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
234
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
235 # Create file with individuals to exclude (beagle has no include option)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
236 selected_model.create_ind_file(overwrite = phase_args.overwrite)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
237
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
238 # Assign exclude file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
239 phase_call_args.extend(['--include-ind', selected_model.ind_file])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
240
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
241 # Confirm that the include file was created
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
242 include_file_created = True
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
243
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
244 # Get the list of chromosomes within the VCF
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
245 chrs_in_vcf = pipe_bcftools_to_chr(phase_args.vcf)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
246
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
247 # Check if the user specified a specific chromosome
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
248 if phase_args.phase_chr:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
249 # Check if the chromosome is within the file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
250 if phase_args.phase_chr not in chrs_in_vcf:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
251 raise Exception('--phase-chr %s not found in %s' % (phase_args.phase_chr, phase_args.vcf))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
252
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
253 # Check that a chr was specified if a bp-based argument was specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
254 if (phase_args.phase_from_bp or phase_args.phase_to_bp) and not phase_args.phase_chr:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
255 # Check if the file has a single chromosome
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
256 if len(chrs_in_vcf) == 1:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
257 # Check if the beagle algorithm is being used
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
258 if phase_args.phase_algorithm == 'beagle':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
259 # Assign the chromosome, as beagle requires it to be assigned
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
260 phase_args.phase_chr = chrs_in_vcf[0]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
261
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
262 logging.info('Assigned chr %s for beagle' % chrs_in_vcf[0])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
263 # Report error if multiple chromosomes are within the file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
264 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
265 raise Exception('The --phase-from-bp and --phase-to-bp arguments '
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
266 'require the --phase-chr argument to function if '
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
267 'multiple chrs are within %s' % phase_args.vcf)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
268
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
269 # Assign general arguments and call beagle
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
270 if phase_args.phase_algorithm == 'beagle':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
271
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
272 # Assigns the input and output arguments
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
273 phase_call_args.extend(['gt=' + phase_args.vcf,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
274 'out=' + phase_args.out_prefix])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
275
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
276 # Assign the burn-in iter, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
277 if phase_args.beagle_burn_iter:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
278 phase_call_args.append('burnin=' + phase_args.beagle_burn_iter)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
279
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
280 # Assign the iter, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
281 if phase_args.beagle_iter:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
282 phase_call_args.append('iterations=' + phase_args.beagle_iter)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
283
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
284 # Assign the state count, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
285 if phase_args.beagle_states:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
286 phase_call_args.append('phase-states=' + phase_args.beagle_states)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
287
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
288 # Assign the window length, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
289 if phase_args.beagle_window:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
290 phase_call_args.append('window=' + phase_args.beagle_window)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
291
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
292 # Assign the window overlap, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
293 if phase_args.beagle_overlap:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
294 phase_call_args.append('overlap=' + phase_args.beagle_overlap)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
295
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
296 # Assign the HMM error, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
297 if phase_args.beagle_error:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
298 phase_call_args.append('err=' + phase_args.beagle_error)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
299
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
300 # Assign the step length, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
301 if phase_args.beagle_step:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
302 phase_call_args.append('step=' + phase_args.beagle_step)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
303
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
304 # Assign the number of steps, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
305 if phase_args.beagle_nsteps:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
306 phase_call_args.append('nsteps=' + phase_args.beagle_nsteps)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
307
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
308 # Assign the genetic map, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
309 if phase_args.genetic_map:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
310 phase_call_args.append('map=' + phase_args.genetic_map)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
311
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
312 # Assign the effective pop size, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
313 if phase_args.Ne:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
314 phase_call_args.append('ne=' + phase_args.Ne)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
315
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
316 # Assign the random seed, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
317 if phase_args.random_seed:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
318 phase_call_args.append('seed=' + phase_args.random_seed)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
319
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
320 # Assign the chromosome to phase, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
321 if phase_args.phase_chr:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
322
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
323 # Store the chromosome argument
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
324 chr_arg = 'chrom=%s' % phase_args.phase_chr
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
325
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
326 # Check if either bp position arguments were specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
327 if phase_args.phase_from_bp or phase_args.phase_to_bp:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
328
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
329 # List of the position arguments, in their required order
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
330 position_args = [':', phase_args.phase_from_bp, '-', phase_args.phase_to_bp]
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
331
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
332 # Filter the position arguments to remove empty values
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
333 filttered_position_args = filter(None, position_args)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
334
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
335 # Map the arguments to str and add them to the chromosome argument
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
336 chr_arg += ''.join(map(str, filttered_position_args))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
337
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
338 # Add the chromosome argument the argument list
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
339 phase_call_args.append(chr_arg)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
340
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
341 # Call beagle wrapper
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
342 call_beagle(phase_args.beagle_path, list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
343
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
344 # Assign expected phased output filename
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
345 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
346
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
347 # Rename output to phase_args.out, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
348 if phase_args.out:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
349 shutil.move(phased_output, phase_args.out)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
350 shutil.move(phase_args.out_prefix + '.log', phase_args.out + '.log')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
351 # Rename log using phased_output
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
352 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
353 shutil.move(phase_args.out_prefix + '.log', phased_output + '.log')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
354
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
355 logging.info('beagle log file created')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
356
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
357 # Assign general arguments and call shapeit
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
358 elif phase_args.phase_algorithm == 'shapeit':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
359
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
360 # Assign the shapeit burn in iter, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
361 if phase_args.shapeit_burn_iter:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
362 phase_call_args.extend(['--burn', phase_args.shapeit_burn_iter])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
363
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
364 # Assign the shapeit prune iter, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
365 if phase_args.shapeit_prune_iter:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
366 phase_call_args.extend(['--prune', phase_args.shapeit_prune_iter])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
367
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
368 # Assign the shapeit main iter, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
369 if phase_args.shapeit_main_iter:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
370 phase_call_args.extend(['--main', phase_args.shapeit_main_iter])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
371
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
372 # Assign the number of shapeit states if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
373 if phase_args.shapeit_states:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
374 phase_call_args.extend(['--states', phase_args.shapeit_states])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
375
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
376 # Assign the shapeit window size, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
377 if phase_args.shapeit_window:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
378 phase_call_args.extend(['--window', phase_args.shapeit_window])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
379
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
380 # Assign the genetic map, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
381 if phase_args.genetic_map:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
382 phase_call_args.extend(['--input-map', phase_args.genetic_map])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
383
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
384 # Assign the effective pop size, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
385 if phase_args.Ne:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
386 phase_call_args.extend(['--effective-size', phase_args.Ne])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
387
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
388 # Assign the random seed, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
389 if phase_args.random_seed:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
390 phase_call_args.extend(['--seed', phase_args.random_seed])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
391
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
392 # Check if only a single shapeit run is required
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
393 if phase_args.phase_chr or len(chrs_in_vcf) == 1:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
394
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
395 # Holds shapeit input filename, as a temp file may be required
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
396 shapeit_input_vcf = None
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
397
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
398 # Check if a single chromosome is selected
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
399 if phase_args.phase_chr and len(chrs_in_vcf) > 1:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
400 # Assign the chromosome-specific input prefix
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
401 chr_input_prefix = phase_args.vcf + '.' + phase_args.phase_chr
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
402
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
403 # Assign the chr-specific file as the shapeit input filename
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
404 shapeit_input_vcf = chr_input_prefix + '.vcf.gz'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
405
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
406 # Create the chromosome-specific input
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
407 chr_subset_file(phase_args.vcf,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
408 phase_args.phase_chr,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
409 chr_input_prefix,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
410 'vcf.gz',
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
411 from_bp = phase_args.phase_from_bp,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
412 to_bp = phase_args.phase_to_bp)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
413
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
414 logging.info('Chr %s subset created' % phase_args.phase_chr)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
415
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
416 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
417 # Assign the shapeit input filename
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
418 shapeit_input_vcf = phase_args.vcf
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
419
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
420 # Assigns the input and output arguments for shapeit
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
421 phase_call_args.extend(['--input-vcf', shapeit_input_vcf,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
422 '--output-max', phase_args.out_prefix,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
423 '--output-log', phase_args.out_prefix + '.phase.log'])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
424
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
425 # Assign the from bp position argument, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
426 if phase_args.phase_from_bp:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
427 phase_call_args.extend(['--input-from', phase_args.phase_from_bp])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
428
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
429 # Assign the to bp position argument, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
430 if phase_args.phase_to_bp:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
431 phase_call_args.extend(['--input-to', phase_args.phase_to_bp])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
432
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
433 # Call shapeit wrapper
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
434 call_shapeit(list(map(str, phase_call_args)), phase_args.out_prefix, phase_args.out_format)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
435
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
436 # Assign expected phased output filename
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
437 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
438
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
439 # Combine the log files
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
440 concatenate_logs([phase_args.out_prefix + '.phase.log', phase_args.out_prefix + '.log'], phased_output + '.log')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
441
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
442 # Rename output to phase_args.out, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
443 if phase_args.out:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
444 shutil.move(phased_output, phase_args.out)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
445 shutil.move(phased_output + '.log', phase_args.out + '.log')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
446
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
447 logging.info('shapeit log file created')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
448
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
449 # Check if a chr subset file was created
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
450 if phase_args.phase_chr and len(chrs_in_vcf) > 1:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
451 # Delete the chromosome-specific input
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
452 os.remove(shapeit_input_vcf)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
453
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
454 logging.info('Chr %s subset deleted' % phase_args.phase_chr)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
455
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
456 # Remove intermediate files created by shapeit
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
457 remove_intermediate_files(phase_args.out_prefix)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
458
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
459 # Check if multiple shapeit runs are required
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
460 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
461
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
462 # List to store the phased filenames
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
463 phased_filename_list = []
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
464
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
465 # List to store the phased logs
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
466 phased_log_list = []
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
467
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
468 logging.info('Multi-chr shapeit phasing assigned')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
469
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
470 for chr_in_vcf in chrs_in_vcf:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
471
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
472 logging.info('Chr %s assigned' % chr_in_vcf)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
473
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
474 # Copy the arguments for this run
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
475 chr_call_args = copy.deepcopy(phase_call_args)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
476
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
477 # Assign the chromosome-specific output prefix
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
478 chr_out_prefix = phase_args.out_prefix + '.' + chr_in_vcf
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
479
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
480 # Assign the expected chromosome-specific output filename
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
481 chr_out_filename = '%s.%s' % (chr_out_prefix, phase_args.out_format)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
482
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
483 # Store the output filename
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
484 phased_filename_list.append(chr_out_filename)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
485
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
486 # Assign the chromosome-specific input prefix
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
487 chr_input_prefix = phase_args.vcf + '.' + chr_in_vcf
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
488
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
489 # Assign the expected chromosome-specific input filename
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
490 chr_in_filename = chr_input_prefix + '.vcf.gz'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
491
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
492 # Create the chromosome-specific input
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
493 chr_subset_file(phase_args.vcf,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
494 chr_in_vcf,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
495 chr_input_prefix,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
496 'vcf.gz')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
497
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
498 # Assigns the input and output arguments for shapeit
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
499 chr_call_args.extend(['--input-vcf', chr_in_filename,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
500 '--output-max', chr_out_prefix,
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
501 '--output-log', chr_out_prefix + '.phase.log'])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
502
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
503 # Call shapeit wrapper
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
504 call_shapeit(list(map(str, chr_call_args)), chr_out_prefix, phase_args.out_format)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
505
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
506 # Combine log files
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
507 concatenate_logs([chr_out_prefix + '.phase.log', chr_out_prefix + '.log'], chr_out_filename + '.log')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
508
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
509 # Store the filename of the combined logs
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
510 phased_log_list.append(chr_out_filename + '.log')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
511
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
512 # Delete the chromosome-specific input
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
513 os.remove(chr_in_filename)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
514
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
515 # Remove intermediate files created by shapeit
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
516 remove_intermediate_files(chr_out_prefix)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
517
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
518 # Concatenate the vcf files
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
519 concatenate(phased_filename_list, phase_args.out_prefix, phase_args.out_format)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
520
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
521 logging.info('Concatenated chromosomes')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
522
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
523 # Assign expected concatenated output filename
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
524 phased_output = '%s.%s' % (phase_args.out_prefix, phase_args.out_format)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
525
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
526 # Combine the log files
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
527 concatenate_logs(phased_log_list, phased_output + '.log')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
528
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
529 # Rename output to phase_args.out, if specified
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
530 if phase_args.out:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
531 shutil.move(phased_output, phase_args.out)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
532 shutil.move(phased_output + '.log', phase_args.out + '.log')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
533
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
534 logging.info('Multi-chr shapeit log created')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
535
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
536 # Reverts the VCF input file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
537 if vcfname_renamed:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
538 os.rename(phase_args.vcf, phase_args.vcf[:-len(vcfname_ext)])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
539
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
540 if __name__ == "__main__":
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
541 initLogger()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
542 run()