comparison plink.py @ 4:901857c9b24f draft

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:30:37 -0400
parents
children
comparison
equal deleted inserted replaced
3:d1e3db7f6521 4:901857c9b24f
1 import os
2 import sys
3 import subprocess
4 import shutil
5 import argparse
6 import glob
7 import copy
8 import logging
9 import re
10
11 # Call PPP-based scripts
12 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
13
14 from bcftools import convert_to_bcf
15 from logging_module import initLogger, logArgs
16
17 def assign_delim_from_ids (filename, id_column = 0, default_delim = '_'):
18
19 delim_symbols = ['-', '|', '/', '@', '&', '*', '^', '%', '$']
20
21 symbols_present = []
22
23 with open(filename, 'rU') as sample_file:
24 for line_pos, sample_line in enumerate(sample_file):
25 if line_pos > 1:
26 # Stores the current sample ID
27 sample_id = sample_line.strip().split()[id_column]
28 # Symbols found in the sample ID
29 id_symbols = re.sub(r'[a-zA-Z0-9]','', sample_id)
30 # Iterate the symbols from the ID
31 for id_symbol in iter(id_symbols):
32 # Check if that symbol has been stored
33 if id_symbol not in symbols_present:
34 # Store symbols
35 symbols_present.append(id_symbol)
36
37 if default_delim not in symbols_present:
38 return default_delim
39 else:
40 for delim_symbol in delim_symbols:
41 if delim_symbol not in symbols_present:
42 return delim_symbol
43
44 raise IOError('Cannot assign plink delimiter')
45
46 def convert_haps_to_vcf (haps_prefix, output_format, output_prefix = ''):
47
48 # Check that input files included a haps file
49 if not os.path.isfile(haps_prefix + '.haps'):
50 # Return error message if no haps input was found
51 raise IOError('Unable to assign haps file. Please confirm the prefix and/or command is specified correctly')
52
53 # Check that input files included a sample file
54 if not os.path.isfile(haps_prefix + '.sample'):
55 # Return error message if no sample input was found
56 raise IOError('Unable to assign sample file. Please confirm the prefix and/or command is specified correctly')
57
58 haps_input_args = ['--haps', haps_prefix + '.haps', 'ref-first', '--sample', haps_prefix + '.sample']
59
60 # If no output_prefix is assigned, use haps_prefix
61 if not output_prefix:
62 output_prefix = haps_prefix
63
64 plink_delim = assign_delim_from_ids(haps_prefix + '.sample')
65
66 if output_format == 'vcf':
67 haps_output_args = ['--recode', 'vcf-iid', 'id-delim=' + plink_delim, '--out', output_prefix]
68
69 elif output_format == 'vcf.gz':
70 haps_output_args = ['--recode', 'vcf-iid', 'bgz', 'id-delim=' + plink_delim, '--out', output_prefix]
71
72 elif output_format == 'bcf':
73 haps_output_args = ['--recode', 'vcf-iid', 'bgz', 'id-delim=' + plink_delim, '--out', output_prefix]
74
75 else:
76 raise IOError('Unknown file format. This error should NOT show up')
77
78 # Call plink2
79 standard_plink2_call(haps_input_args + haps_output_args)
80
81 # Convert VCFGZ to BCF, as PLINK cannot directly create a BCF
82 if output_format == 'bcf':
83 # Convert to BCF
84 convert_to_bcf(output_prefix + '.vcf.gz', output_prefix)
85
86 def check_plink_for_errors (plink_stderr):
87 '''
88 Checks the plink stderr for errors
89
90 Parameters
91 ----------
92 plink_stderr : str
93 plink stderr
94
95 Raises
96 ------
97 IOError
98 If plink stderr returns an error
99 '''
100
101 # Print output if error found. Build up as errors are discovered
102 if plink_stderr:
103 raise Exception(plink_stderr)
104
105 def standard_plink2_call (plink2_call_args):
106 '''
107 Standard call of plink2
108
109 The function calls plink2. Returns the stderr of plink to create a log
110 file of the call.
111
112 Parameters
113 ----------
114 plink2_call_args : list
115 plink2 arguments
116
117 Raises
118 ------
119 Exception
120 If plink2 stderr returns an error
121 '''
122
123
124 # plink subprocess call
125 plink2_call = subprocess.Popen(['plink2'] + list(map(str, plink2_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
126
127 # Wait for plink to finish
128 plink2_out, plink2_err = plink2_call.communicate()
129
130 # Check if code is running in python 3
131 if sys.version_info[0] == 3:
132 # Convert bytes to string
133 plink2_out = plink2_out.decode()
134 plink2_err = plink2_err.decode()
135
136 logging.info('plink2 call complete')
137
138 # Check that the log file was created correctly
139 check_plink_for_errors(plink2_err)
140
141 def standard_plink_call (plink_call_args):
142 '''
143 Standard call of plink
144
145 The function calls plink. Returns the stderr of plink to create a log
146 file of the call.
147
148 Parameters
149 ----------
150 plink_call_args : list
151 plink arguments
152
153 Raises
154 ------
155 Exception
156 If plink stderr returns an error
157 '''
158
159 # plink subprocess call
160 plink_call = subprocess.Popen(['plink'] + list(map(str, plink_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
161
162 # Wait for plink to finish
163 plink_out, plink_err = plink_call.communicate()
164
165 # Check if code is running in python 3
166 if sys.version_info[0] == 3:
167 # Convert bytes to string
168 plink_out = plink_out.decode()
169 plink_err = plink_err.decode()
170
171 logging.info('plink call complete')
172
173 # Check that the log file was created correctly
174 check_plink_for_errors(plink_err)
175
176 def call_plink (plink_call_args, output_prefix, output_format):
177 '''
178 Determines which version of plink (plink1.9 vs. plink2a) to call
179
180 The function is required for the plink wrapper to determine the needed
181 plink version and also automates conversion to BCF, which both versions
182 of plink do not support.
183
184 Parameters
185 ----------
186 plink_call_args : list
187 plink arguments
188 output_prefix : str
189 Output prefix for non-plink conversions
190 output_format : str
191 Output format for non-plink conversions
192
193 '''
194
195 # Check if the call requries plink2 to operate. This function should be
196 # removed once plink2 is out of alpha
197 if '--haps' in plink_input_args:
198 # Convert vcf id system for plink 1.9 to plink 2
199 if 'vcf-iid' in plink_call_args:
200 format_index = plink_call_args.index('vcf-iid')
201 plink_call_args[format_index:format_index + 1] = ['vcf', 'id-paste=fid']
202
203 # Call the plink2 subprocess function
204 standard_plink2_call(plink_call_args)
205
206 # Call plink1.9
207 else:
208 # Call the plink subprocess function
209 standard_plink_call(plink_call_args)
210
211 # Convert VCFGZ to BCF, as PLINK cannot directly create a BCF
212 if output_format == 'bcf':
213 # Convert to BCF
214 convert_to_bcf(output_prefix + '.vcf.gz', output_format)