4
|
1 import os
|
|
2 import sys
|
|
3 import subprocess
|
|
4 import shutil
|
|
5 import argparse
|
|
6 import glob
|
|
7 import copy
|
|
8 import logging
|
|
9 import re
|
|
10
|
|
11 # Call PPP-based scripts
|
|
12 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
|
|
13
|
|
14 from bcftools import convert_to_bcf
|
|
15 from logging_module import initLogger, logArgs
|
|
16
|
|
17 def assign_delim_from_ids (filename, id_column = 0, default_delim = '_'):
|
|
18
|
|
19 delim_symbols = ['-', '|', '/', '@', '&', '*', '^', '%', '$']
|
|
20
|
|
21 symbols_present = []
|
|
22
|
|
23 with open(filename, 'rU') as sample_file:
|
|
24 for line_pos, sample_line in enumerate(sample_file):
|
|
25 if line_pos > 1:
|
|
26 # Stores the current sample ID
|
|
27 sample_id = sample_line.strip().split()[id_column]
|
|
28 # Symbols found in the sample ID
|
|
29 id_symbols = re.sub(r'[a-zA-Z0-9]','', sample_id)
|
|
30 # Iterate the symbols from the ID
|
|
31 for id_symbol in iter(id_symbols):
|
|
32 # Check if that symbol has been stored
|
|
33 if id_symbol not in symbols_present:
|
|
34 # Store symbols
|
|
35 symbols_present.append(id_symbol)
|
|
36
|
|
37 if default_delim not in symbols_present:
|
|
38 return default_delim
|
|
39 else:
|
|
40 for delim_symbol in delim_symbols:
|
|
41 if delim_symbol not in symbols_present:
|
|
42 return delim_symbol
|
|
43
|
|
44 raise IOError('Cannot assign plink delimiter')
|
|
45
|
|
46 def convert_haps_to_vcf (haps_prefix, output_format, output_prefix = ''):
|
|
47
|
|
48 # Check that input files included a haps file
|
|
49 if not os.path.isfile(haps_prefix + '.haps'):
|
|
50 # Return error message if no haps input was found
|
|
51 raise IOError('Unable to assign haps file. Please confirm the prefix and/or command is specified correctly')
|
|
52
|
|
53 # Check that input files included a sample file
|
|
54 if not os.path.isfile(haps_prefix + '.sample'):
|
|
55 # Return error message if no sample input was found
|
|
56 raise IOError('Unable to assign sample file. Please confirm the prefix and/or command is specified correctly')
|
|
57
|
|
58 haps_input_args = ['--haps', haps_prefix + '.haps', 'ref-first', '--sample', haps_prefix + '.sample']
|
|
59
|
|
60 # If no output_prefix is assigned, use haps_prefix
|
|
61 if not output_prefix:
|
|
62 output_prefix = haps_prefix
|
|
63
|
|
64 plink_delim = assign_delim_from_ids(haps_prefix + '.sample')
|
|
65
|
|
66 if output_format == 'vcf':
|
|
67 haps_output_args = ['--recode', 'vcf-iid', 'id-delim=' + plink_delim, '--out', output_prefix]
|
|
68
|
|
69 elif output_format == 'vcf.gz':
|
|
70 haps_output_args = ['--recode', 'vcf-iid', 'bgz', 'id-delim=' + plink_delim, '--out', output_prefix]
|
|
71
|
|
72 elif output_format == 'bcf':
|
|
73 haps_output_args = ['--recode', 'vcf-iid', 'bgz', 'id-delim=' + plink_delim, '--out', output_prefix]
|
|
74
|
|
75 else:
|
|
76 raise IOError('Unknown file format. This error should NOT show up')
|
|
77
|
|
78 # Call plink2
|
|
79 standard_plink2_call(haps_input_args + haps_output_args)
|
|
80
|
|
81 # Convert VCFGZ to BCF, as PLINK cannot directly create a BCF
|
|
82 if output_format == 'bcf':
|
|
83 # Convert to BCF
|
|
84 convert_to_bcf(output_prefix + '.vcf.gz', output_prefix)
|
|
85
|
|
86 def check_plink_for_errors (plink_stderr):
|
|
87 '''
|
|
88 Checks the plink stderr for errors
|
|
89
|
|
90 Parameters
|
|
91 ----------
|
|
92 plink_stderr : str
|
|
93 plink stderr
|
|
94
|
|
95 Raises
|
|
96 ------
|
|
97 IOError
|
|
98 If plink stderr returns an error
|
|
99 '''
|
|
100
|
|
101 # Print output if error found. Build up as errors are discovered
|
|
102 if plink_stderr:
|
|
103 raise Exception(plink_stderr)
|
|
104
|
|
105 def standard_plink2_call (plink2_call_args):
|
|
106 '''
|
|
107 Standard call of plink2
|
|
108
|
|
109 The function calls plink2. Returns the stderr of plink to create a log
|
|
110 file of the call.
|
|
111
|
|
112 Parameters
|
|
113 ----------
|
|
114 plink2_call_args : list
|
|
115 plink2 arguments
|
|
116
|
|
117 Raises
|
|
118 ------
|
|
119 Exception
|
|
120 If plink2 stderr returns an error
|
|
121 '''
|
|
122
|
|
123
|
|
124 # plink subprocess call
|
|
125 plink2_call = subprocess.Popen(['plink2'] + list(map(str, plink2_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
126
|
|
127 # Wait for plink to finish
|
|
128 plink2_out, plink2_err = plink2_call.communicate()
|
|
129
|
|
130 # Check if code is running in python 3
|
|
131 if sys.version_info[0] == 3:
|
|
132 # Convert bytes to string
|
|
133 plink2_out = plink2_out.decode()
|
|
134 plink2_err = plink2_err.decode()
|
|
135
|
|
136 logging.info('plink2 call complete')
|
|
137
|
|
138 # Check that the log file was created correctly
|
|
139 check_plink_for_errors(plink2_err)
|
|
140
|
|
141 def standard_plink_call (plink_call_args):
|
|
142 '''
|
|
143 Standard call of plink
|
|
144
|
|
145 The function calls plink. Returns the stderr of plink to create a log
|
|
146 file of the call.
|
|
147
|
|
148 Parameters
|
|
149 ----------
|
|
150 plink_call_args : list
|
|
151 plink arguments
|
|
152
|
|
153 Raises
|
|
154 ------
|
|
155 Exception
|
|
156 If plink stderr returns an error
|
|
157 '''
|
|
158
|
|
159 # plink subprocess call
|
|
160 plink_call = subprocess.Popen(['plink'] + list(map(str, plink_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
161
|
|
162 # Wait for plink to finish
|
|
163 plink_out, plink_err = plink_call.communicate()
|
|
164
|
|
165 # Check if code is running in python 3
|
|
166 if sys.version_info[0] == 3:
|
|
167 # Convert bytes to string
|
|
168 plink_out = plink_out.decode()
|
|
169 plink_err = plink_err.decode()
|
|
170
|
|
171 logging.info('plink call complete')
|
|
172
|
|
173 # Check that the log file was created correctly
|
|
174 check_plink_for_errors(plink_err)
|
|
175
|
|
176 def call_plink (plink_call_args, output_prefix, output_format):
|
|
177 '''
|
|
178 Determines which version of plink (plink1.9 vs. plink2a) to call
|
|
179
|
|
180 The function is required for the plink wrapper to determine the needed
|
|
181 plink version and also automates conversion to BCF, which both versions
|
|
182 of plink do not support.
|
|
183
|
|
184 Parameters
|
|
185 ----------
|
|
186 plink_call_args : list
|
|
187 plink arguments
|
|
188 output_prefix : str
|
|
189 Output prefix for non-plink conversions
|
|
190 output_format : str
|
|
191 Output format for non-plink conversions
|
|
192
|
|
193 '''
|
|
194
|
|
195 # Check if the call requries plink2 to operate. This function should be
|
|
196 # removed once plink2 is out of alpha
|
|
197 if '--haps' in plink_input_args:
|
|
198 # Convert vcf id system for plink 1.9 to plink 2
|
|
199 if 'vcf-iid' in plink_call_args:
|
|
200 format_index = plink_call_args.index('vcf-iid')
|
|
201 plink_call_args[format_index:format_index + 1] = ['vcf', 'id-paste=fid']
|
|
202
|
|
203 # Call the plink2 subprocess function
|
|
204 standard_plink2_call(plink_call_args)
|
|
205
|
|
206 # Call plink1.9
|
|
207 else:
|
|
208 # Call the plink subprocess function
|
|
209 standard_plink_call(plink_call_args)
|
|
210
|
|
211 # Convert VCFGZ to BCF, as PLINK cannot directly create a BCF
|
|
212 if output_format == 'bcf':
|
|
213 # Convert to BCF
|
|
214 convert_to_bcf(output_prefix + '.vcf.gz', output_format)
|