Mercurial > repos > jaredgk > ppp_vcfphase
comparison plink.py @ 4:901857c9b24f draft
Uploaded
author | jaredgk |
---|---|
date | Wed, 17 Oct 2018 17:30:37 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:d1e3db7f6521 | 4:901857c9b24f |
---|---|
1 import os | |
2 import sys | |
3 import subprocess | |
4 import shutil | |
5 import argparse | |
6 import glob | |
7 import copy | |
8 import logging | |
9 import re | |
10 | |
11 # Call PPP-based scripts | |
12 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared'))) | |
13 | |
14 from bcftools import convert_to_bcf | |
15 from logging_module import initLogger, logArgs | |
16 | |
17 def assign_delim_from_ids (filename, id_column = 0, default_delim = '_'): | |
18 | |
19 delim_symbols = ['-', '|', '/', '@', '&', '*', '^', '%', '$'] | |
20 | |
21 symbols_present = [] | |
22 | |
23 with open(filename, 'rU') as sample_file: | |
24 for line_pos, sample_line in enumerate(sample_file): | |
25 if line_pos > 1: | |
26 # Stores the current sample ID | |
27 sample_id = sample_line.strip().split()[id_column] | |
28 # Symbols found in the sample ID | |
29 id_symbols = re.sub(r'[a-zA-Z0-9]','', sample_id) | |
30 # Iterate the symbols from the ID | |
31 for id_symbol in iter(id_symbols): | |
32 # Check if that symbol has been stored | |
33 if id_symbol not in symbols_present: | |
34 # Store symbols | |
35 symbols_present.append(id_symbol) | |
36 | |
37 if default_delim not in symbols_present: | |
38 return default_delim | |
39 else: | |
40 for delim_symbol in delim_symbols: | |
41 if delim_symbol not in symbols_present: | |
42 return delim_symbol | |
43 | |
44 raise IOError('Cannot assign plink delimiter') | |
45 | |
46 def convert_haps_to_vcf (haps_prefix, output_format, output_prefix = ''): | |
47 | |
48 # Check that input files included a haps file | |
49 if not os.path.isfile(haps_prefix + '.haps'): | |
50 # Return error message if no haps input was found | |
51 raise IOError('Unable to assign haps file. Please confirm the prefix and/or command is specified correctly') | |
52 | |
53 # Check that input files included a sample file | |
54 if not os.path.isfile(haps_prefix + '.sample'): | |
55 # Return error message if no sample input was found | |
56 raise IOError('Unable to assign sample file. Please confirm the prefix and/or command is specified correctly') | |
57 | |
58 haps_input_args = ['--haps', haps_prefix + '.haps', 'ref-first', '--sample', haps_prefix + '.sample'] | |
59 | |
60 # If no output_prefix is assigned, use haps_prefix | |
61 if not output_prefix: | |
62 output_prefix = haps_prefix | |
63 | |
64 plink_delim = assign_delim_from_ids(haps_prefix + '.sample') | |
65 | |
66 if output_format == 'vcf': | |
67 haps_output_args = ['--recode', 'vcf-iid', 'id-delim=' + plink_delim, '--out', output_prefix] | |
68 | |
69 elif output_format == 'vcf.gz': | |
70 haps_output_args = ['--recode', 'vcf-iid', 'bgz', 'id-delim=' + plink_delim, '--out', output_prefix] | |
71 | |
72 elif output_format == 'bcf': | |
73 haps_output_args = ['--recode', 'vcf-iid', 'bgz', 'id-delim=' + plink_delim, '--out', output_prefix] | |
74 | |
75 else: | |
76 raise IOError('Unknown file format. This error should NOT show up') | |
77 | |
78 # Call plink2 | |
79 standard_plink2_call(haps_input_args + haps_output_args) | |
80 | |
81 # Convert VCFGZ to BCF, as PLINK cannot directly create a BCF | |
82 if output_format == 'bcf': | |
83 # Convert to BCF | |
84 convert_to_bcf(output_prefix + '.vcf.gz', output_prefix) | |
85 | |
86 def check_plink_for_errors (plink_stderr): | |
87 ''' | |
88 Checks the plink stderr for errors | |
89 | |
90 Parameters | |
91 ---------- | |
92 plink_stderr : str | |
93 plink stderr | |
94 | |
95 Raises | |
96 ------ | |
97 IOError | |
98 If plink stderr returns an error | |
99 ''' | |
100 | |
101 # Print output if error found. Build up as errors are discovered | |
102 if plink_stderr: | |
103 raise Exception(plink_stderr) | |
104 | |
105 def standard_plink2_call (plink2_call_args): | |
106 ''' | |
107 Standard call of plink2 | |
108 | |
109 The function calls plink2. Returns the stderr of plink to create a log | |
110 file of the call. | |
111 | |
112 Parameters | |
113 ---------- | |
114 plink2_call_args : list | |
115 plink2 arguments | |
116 | |
117 Raises | |
118 ------ | |
119 Exception | |
120 If plink2 stderr returns an error | |
121 ''' | |
122 | |
123 | |
124 # plink subprocess call | |
125 plink2_call = subprocess.Popen(['plink2'] + list(map(str, plink2_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
126 | |
127 # Wait for plink to finish | |
128 plink2_out, plink2_err = plink2_call.communicate() | |
129 | |
130 # Check if code is running in python 3 | |
131 if sys.version_info[0] == 3: | |
132 # Convert bytes to string | |
133 plink2_out = plink2_out.decode() | |
134 plink2_err = plink2_err.decode() | |
135 | |
136 logging.info('plink2 call complete') | |
137 | |
138 # Check that the log file was created correctly | |
139 check_plink_for_errors(plink2_err) | |
140 | |
141 def standard_plink_call (plink_call_args): | |
142 ''' | |
143 Standard call of plink | |
144 | |
145 The function calls plink. Returns the stderr of plink to create a log | |
146 file of the call. | |
147 | |
148 Parameters | |
149 ---------- | |
150 plink_call_args : list | |
151 plink arguments | |
152 | |
153 Raises | |
154 ------ | |
155 Exception | |
156 If plink stderr returns an error | |
157 ''' | |
158 | |
159 # plink subprocess call | |
160 plink_call = subprocess.Popen(['plink'] + list(map(str, plink_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
161 | |
162 # Wait for plink to finish | |
163 plink_out, plink_err = plink_call.communicate() | |
164 | |
165 # Check if code is running in python 3 | |
166 if sys.version_info[0] == 3: | |
167 # Convert bytes to string | |
168 plink_out = plink_out.decode() | |
169 plink_err = plink_err.decode() | |
170 | |
171 logging.info('plink call complete') | |
172 | |
173 # Check that the log file was created correctly | |
174 check_plink_for_errors(plink_err) | |
175 | |
176 def call_plink (plink_call_args, output_prefix, output_format): | |
177 ''' | |
178 Determines which version of plink (plink1.9 vs. plink2a) to call | |
179 | |
180 The function is required for the plink wrapper to determine the needed | |
181 plink version and also automates conversion to BCF, which both versions | |
182 of plink do not support. | |
183 | |
184 Parameters | |
185 ---------- | |
186 plink_call_args : list | |
187 plink arguments | |
188 output_prefix : str | |
189 Output prefix for non-plink conversions | |
190 output_format : str | |
191 Output format for non-plink conversions | |
192 | |
193 ''' | |
194 | |
195 # Check if the call requries plink2 to operate. This function should be | |
196 # removed once plink2 is out of alpha | |
197 if '--haps' in plink_input_args: | |
198 # Convert vcf id system for plink 1.9 to plink 2 | |
199 if 'vcf-iid' in plink_call_args: | |
200 format_index = plink_call_args.index('vcf-iid') | |
201 plink_call_args[format_index:format_index + 1] = ['vcf', 'id-paste=fid'] | |
202 | |
203 # Call the plink2 subprocess function | |
204 standard_plink2_call(plink_call_args) | |
205 | |
206 # Call plink1.9 | |
207 else: | |
208 # Call the plink subprocess function | |
209 standard_plink_call(plink_call_args) | |
210 | |
211 # Convert VCFGZ to BCF, as PLINK cannot directly create a BCF | |
212 if output_format == 'bcf': | |
213 # Convert to BCF | |
214 convert_to_bcf(output_prefix + '.vcf.gz', output_format) |