annotate plink.py @ 5:86a9d8d5b291 draft default tip

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:34:34 -0400
parents 901857c9b24f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
1 import os
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
2 import sys
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
3 import subprocess
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
4 import shutil
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
5 import argparse
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
6 import glob
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
7 import copy
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
8 import logging
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
9 import re
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
10
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
11 # Call PPP-based scripts
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
12 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
13
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
14 from bcftools import convert_to_bcf
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
15 from logging_module import initLogger, logArgs
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
16
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
17 def assign_delim_from_ids (filename, id_column = 0, default_delim = '_'):
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
18
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
19 delim_symbols = ['-', '|', '/', '@', '&', '*', '^', '%', '$']
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
20
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
21 symbols_present = []
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
22
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
23 with open(filename, 'rU') as sample_file:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
24 for line_pos, sample_line in enumerate(sample_file):
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
25 if line_pos > 1:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
26 # Stores the current sample ID
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
27 sample_id = sample_line.strip().split()[id_column]
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
28 # Symbols found in the sample ID
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
29 id_symbols = re.sub(r'[a-zA-Z0-9]','', sample_id)
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
30 # Iterate the symbols from the ID
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
31 for id_symbol in iter(id_symbols):
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
32 # Check if that symbol has been stored
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
33 if id_symbol not in symbols_present:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
34 # Store symbols
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
35 symbols_present.append(id_symbol)
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
36
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
37 if default_delim not in symbols_present:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
38 return default_delim
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
39 else:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
40 for delim_symbol in delim_symbols:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
41 if delim_symbol not in symbols_present:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
42 return delim_symbol
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
43
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
44 raise IOError('Cannot assign plink delimiter')
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
45
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
46 def convert_haps_to_vcf (haps_prefix, output_format, output_prefix = ''):
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
47
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
48 # Check that input files included a haps file
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
49 if not os.path.isfile(haps_prefix + '.haps'):
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
50 # Return error message if no haps input was found
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
51 raise IOError('Unable to assign haps file. Please confirm the prefix and/or command is specified correctly')
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
52
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
53 # Check that input files included a sample file
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
54 if not os.path.isfile(haps_prefix + '.sample'):
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
55 # Return error message if no sample input was found
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
56 raise IOError('Unable to assign sample file. Please confirm the prefix and/or command is specified correctly')
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
57
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
58 haps_input_args = ['--haps', haps_prefix + '.haps', 'ref-first', '--sample', haps_prefix + '.sample']
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
59
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
60 # If no output_prefix is assigned, use haps_prefix
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
61 if not output_prefix:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
62 output_prefix = haps_prefix
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
63
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
64 plink_delim = assign_delim_from_ids(haps_prefix + '.sample')
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
65
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
66 if output_format == 'vcf':
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
67 haps_output_args = ['--recode', 'vcf-iid', 'id-delim=' + plink_delim, '--out', output_prefix]
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
68
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
69 elif output_format == 'vcf.gz':
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
70 haps_output_args = ['--recode', 'vcf-iid', 'bgz', 'id-delim=' + plink_delim, '--out', output_prefix]
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
71
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
72 elif output_format == 'bcf':
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
73 haps_output_args = ['--recode', 'vcf-iid', 'bgz', 'id-delim=' + plink_delim, '--out', output_prefix]
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
74
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
75 else:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
76 raise IOError('Unknown file format. This error should NOT show up')
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
77
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
78 # Call plink2
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
79 standard_plink2_call(haps_input_args + haps_output_args)
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
80
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
81 # Convert VCFGZ to BCF, as PLINK cannot directly create a BCF
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
82 if output_format == 'bcf':
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
83 # Convert to BCF
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
84 convert_to_bcf(output_prefix + '.vcf.gz', output_prefix)
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
85
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
86 def check_plink_for_errors (plink_stderr):
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
87 '''
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
88 Checks the plink stderr for errors
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
89
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
90 Parameters
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
91 ----------
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
92 plink_stderr : str
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
93 plink stderr
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
94
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
95 Raises
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
96 ------
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
97 IOError
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
98 If plink stderr returns an error
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
99 '''
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
100
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
101 # Print output if error found. Build up as errors are discovered
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
102 if plink_stderr:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
103 raise Exception(plink_stderr)
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
104
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
105 def standard_plink2_call (plink2_call_args):
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
106 '''
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
107 Standard call of plink2
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
108
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
109 The function calls plink2. Returns the stderr of plink to create a log
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
110 file of the call.
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
111
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
112 Parameters
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
113 ----------
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
114 plink2_call_args : list
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
115 plink2 arguments
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
116
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
117 Raises
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
118 ------
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
119 Exception
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
120 If plink2 stderr returns an error
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
121 '''
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
122
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
123
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
124 # plink subprocess call
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
125 plink2_call = subprocess.Popen(['plink2'] + list(map(str, plink2_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
126
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
127 # Wait for plink to finish
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
128 plink2_out, plink2_err = plink2_call.communicate()
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
129
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
130 # Check if code is running in python 3
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
131 if sys.version_info[0] == 3:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
132 # Convert bytes to string
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
133 plink2_out = plink2_out.decode()
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
134 plink2_err = plink2_err.decode()
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
135
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
136 logging.info('plink2 call complete')
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
137
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
138 # Check that the log file was created correctly
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
139 check_plink_for_errors(plink2_err)
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
140
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
141 def standard_plink_call (plink_call_args):
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
142 '''
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
143 Standard call of plink
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
144
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
145 The function calls plink. Returns the stderr of plink to create a log
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
146 file of the call.
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
147
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
148 Parameters
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
149 ----------
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
150 plink_call_args : list
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
151 plink arguments
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
152
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
153 Raises
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
154 ------
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
155 Exception
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
156 If plink stderr returns an error
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
157 '''
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
158
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
159 # plink subprocess call
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
160 plink_call = subprocess.Popen(['plink'] + list(map(str, plink_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
161
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
162 # Wait for plink to finish
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
163 plink_out, plink_err = plink_call.communicate()
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
164
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
165 # Check if code is running in python 3
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
166 if sys.version_info[0] == 3:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
167 # Convert bytes to string
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
168 plink_out = plink_out.decode()
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
169 plink_err = plink_err.decode()
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
170
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
171 logging.info('plink call complete')
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
172
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
173 # Check that the log file was created correctly
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
174 check_plink_for_errors(plink_err)
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
175
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
176 def call_plink (plink_call_args, output_prefix, output_format):
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
177 '''
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
178 Determines which version of plink (plink1.9 vs. plink2a) to call
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
179
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
180 The function is required for the plink wrapper to determine the needed
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
181 plink version and also automates conversion to BCF, which both versions
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
182 of plink do not support.
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
183
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
184 Parameters
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
185 ----------
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
186 plink_call_args : list
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
187 plink arguments
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
188 output_prefix : str
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
189 Output prefix for non-plink conversions
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
190 output_format : str
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
191 Output format for non-plink conversions
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
192
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
193 '''
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
194
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
195 # Check if the call requries plink2 to operate. This function should be
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
196 # removed once plink2 is out of alpha
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
197 if '--haps' in plink_input_args:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
198 # Convert vcf id system for plink 1.9 to plink 2
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
199 if 'vcf-iid' in plink_call_args:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
200 format_index = plink_call_args.index('vcf-iid')
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
201 plink_call_args[format_index:format_index + 1] = ['vcf', 'id-paste=fid']
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
202
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
203 # Call the plink2 subprocess function
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
204 standard_plink2_call(plink_call_args)
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
205
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
206 # Call plink1.9
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
207 else:
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
208 # Call the plink subprocess function
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
209 standard_plink_call(plink_call_args)
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
210
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
211 # Convert VCFGZ to BCF, as PLINK cannot directly create a BCF
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
212 if output_format == 'bcf':
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
213 # Convert to BCF
901857c9b24f Uploaded
jaredgk
parents:
diff changeset
214 convert_to_bcf(output_prefix + '.vcf.gz', output_format)