annotate vcftools.py @ 4:901857c9b24f draft

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:30:37 -0400
parents d1e3db7f6521
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
1 import os
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
2 import sys
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
3 import logging
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
4 import subprocess
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
5
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
6 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
7
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
8 from vcf_reader_func import checkFormat
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
9 from bcftools import check_bcftools_for_errors
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
10
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def check_bgzip_for_errors (bgzip_stderr):
    '''
    Checks the bgzip stderr for errors

    Any non-empty stderr is treated as a failure. The stderr text is
    appended to the exception message so the cause reported by bgzip is
    not lost.

    Parameters
    ----------
    bgzip_stderr : str or bytes
        bgzip stderr

    Raises
    ------
    IOError
        If bgzip stderr returns an error
    '''

    if bgzip_stderr:

        # subprocess pipes yield bytes in python 3, convert for the message
        if not isinstance(bgzip_stderr, str):
            bgzip_stderr = bgzip_stderr.decode('utf-8', 'replace')

        raise IOError('Error occured while compressing the vcf file: %s' % bgzip_stderr.strip())
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
28
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def bgzip_decompress_vcfgz (vcfgz_filename, out_prefix = '', keep_original = False):
    '''
    Converts a vcf.gz to vcf

    The function automates bgzip to decompress a vcf.gz file into a vcf.
    When an output prefix is given or the original is kept, bgzip writes
    to stdout and the result is redirected into a new file; otherwise
    bgzip is called with -d on the input file itself.

    Parameters
    ----------
    vcfgz_filename : str
        The file name of the vcf.gz file to be decompressed
    out_prefix : str
        Output file prefix (i.e. filename without extension)
    keep_original : bool
        Specifies if the original file should be kept

    Raises
    ------
    IOError
        Error in creating the decompressed file
    '''

    # Run bgzip with stdout piped to file
    if keep_original or out_prefix:

        if out_prefix:

            # Assign the bgzip filename
            vcf_filename = out_prefix + '.vcf'

        else:

            # Seperate into path and filename
            split_path, split_filename = os.path.split(vcfgz_filename)

            # Remove any file extensions (everything after the first separator)
            vcf_basename = split_filename.split(os.extsep)[0] + '.vcf'

            # Join path and filename
            vcf_filename = os.path.join(split_path, vcf_basename)

        # Create the output file, the with-block guarantees the handle is
        # closed even if bgzip cannot be started
        with open(vcf_filename, 'w') as vcf_file:

            # bgzip subprocess call
            bgzip_call = subprocess.Popen(['bgzip', '-dc', vcfgz_filename], stdout = vcf_file, stderr = subprocess.PIPE)

            # Save the stderr from bgzip (stdout goes to the file)
            _, bgzip_err = bgzip_call.communicate()

    # Run bgzip normally
    else:

        # bgzip subprocess call
        bgzip_call = subprocess.Popen(['bgzip', '-d', vcfgz_filename], stdout = subprocess.PIPE, stderr = subprocess.PIPE)

        # Save the stderr from bgzip
        _, bgzip_err = bgzip_call.communicate()

    # Check that output file was decompressed correctly
    check_bgzip_for_errors(bgzip_err)

    # Delete input when also using an output prefix. This is only reached
    # if the check above did not raise
    if out_prefix and not keep_original:
        os.remove(vcfgz_filename)
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
90
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def bgzip_compress_vcf (vcf_filename, out_prefix = '', keep_original = False):
    '''
    Converts a vcf to vcf.gz

    The function automates bgzip to compress a vcf file into a vcf.gz.
    When an output prefix is given or the original is kept, bgzip writes
    to stdout and the result is redirected into a new file; otherwise
    bgzip is called on the input file itself.

    Parameters
    ----------
    vcf_filename : str
        The file name of the vcf file to be compressed
    out_prefix : str
        Output file prefix (i.e. filename without extension)
    keep_original : bool
        Specifies if the original file should be kept

    Raises
    ------
    IOError
        Error in creating the compressed file
    '''

    # Compress and keep the original file
    if keep_original or out_prefix:

        if out_prefix:

            # Assign the filename
            vcfgz_filename = out_prefix + '.vcf.gz'

        else:

            # Seperate into path and filename. The input name is used here,
            # the output name is not assigned yet at this point
            split_path, split_filename = os.path.split(vcf_filename)

            # Remove any file extensions (everything after the first separator)
            vcfgz_basename = split_filename.split(os.extsep)[0] + '.vcf.gz'

            # Join path and filename
            vcfgz_filename = os.path.join(split_path, vcfgz_basename)

        # Create the output file in binary mode (bgzip emits compressed
        # bytes), the with-block guarantees the handle is closed
        with open(vcfgz_filename, 'wb') as vcfgz_file:

            # bgzip subprocess call
            bgzip_call = subprocess.Popen(['bgzip', '-c', vcf_filename], stdout = vcfgz_file, stderr = subprocess.PIPE)

            # Save the stderr from bgzip (stdout goes to the file)
            _, bgzip_err = bgzip_call.communicate()

        # NOTE(review): unlike bgzip_decompress_vcfgz, the input is not
        # removed when out_prefix is set and keep_original is False -
        # confirm whether that asymmetry is intended

    # Compress and do not keep the original file
    else:

        # bgzip subprocess call
        bgzip_call = subprocess.Popen(['bgzip', vcf_filename], stdout = subprocess.PIPE, stderr = subprocess.PIPE)

        # Save the stderr from bgzip
        _, bgzip_err = bgzip_call.communicate()

    # Check that output file was compressed correctly
    check_bgzip_for_errors(bgzip_err)
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
147
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def cvt_vcftools_site_to_bed (vcftools_out_str):
    '''
    Converts a line of vcftools site output into a bed formatted line

    The header line (the line holding both CHROM and POS) and blank lines
    are dropped. For every other line the second column is read as an
    integer position and position + 1 is appended as a new last column.

    Parameters
    ----------
    vcftools_out_str : str or bytes
        A single tab-delimited line from the vcftools output

    Returns
    -------
    str
        The converted line (with newline), or an empty string if the
        line was the header or blank

    Raises
    ------
    IndexError
        If a data line has fewer than two columns
    ValueError
        If the second column is not an integer
    '''

    # Lines read from a subprocess pipe are bytes in python 3
    if not isinstance(vcftools_out_str, str):
        vcftools_out_str = vcftools_out_str.decode()

    # Remove the header
    if 'CHROM' in vcftools_out_str and 'POS' in vcftools_out_str:
        return ''

    stripped_line = vcftools_out_str.strip()

    # Nothing to convert on a blank line
    if not stripped_line:
        return ''

    # Split the line into a list
    vcftools_out_data = stripped_line.split('\t')

    # Convert the chromStart to int
    vcftools_out_data[1] = int(vcftools_out_data[1])

    # Calc chromEnd
    # NOTE(review): the vcftools position is used unchanged as chromStart;
    # confirm whether consumers expect 0-based bed coordinates
    chrom_end = vcftools_out_data[1] + 1

    # Add chrom_end to the list
    vcftools_out_data.append(chrom_end)

    # Return the list as a string (with newline element)
    return '\t'.join(map(str, vcftools_out_data)) + '\n'
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
164
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def pipe_vcftools (vcftools_call_args):
    '''
    Starts vcftools with its results sent to a pipe

    vcftools is launched with --stdout followed by the given arguments
    (each converted to str). Both stdout and stderr of the process are
    pipes that the caller is expected to consume. No error checking is
    done here; the caller should inspect the stderr once the process has
    finished (e.g. using check_vcftools_for_errors).

    Parameters
    ----------
    vcftools_call_args : list
        vcftools arguments

    Returns
    -------
    vcftools_call : subprocess.Popen
        The running vcftools process. Its stdout attribute is the results
        pipe and its stderr attribute is the log pipe

    '''

    # Assemble the full command line, stringifying every argument
    vcftools_command = ['vcftools', '--stdout']
    vcftools_command.extend(str(call_arg) for call_arg in vcftools_call_args)

    # Launch vcftools with both output streams captured
    return subprocess.Popen(vcftools_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
195
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def pipe_vcftools_to_bed_file (vcftools_call_args, output_filename):

    '''
    Pipes site-file output of vcftools to a bed formatted file

    Each line of the vcftools stdout is converted using
    cvt_vcftools_site_to_bed and written to the output file, so no
    intermediate site file is created. If the conversion fails the
    partial output file is removed, the vcftools log is still checked
    first, and the conversion error is then re-raised instead of being
    silently discarded.

    Parameters
    ----------
    vcftools_call_args : list
        vcftools arguments
    output_filename : str
        Filename of the bed file

    Returns
    -------
    vcftools_stderr : str
        vcftools log output

    Raises
    ------
    Exception
        Whatever check_vcftools_for_errors raises for the vcftools log,
        otherwise any error hit while writing the bed file

    '''
    # Open vcftools pipe
    vcftools_call = pipe_vcftools(vcftools_call_args)

    # Holds a failure from the conversion until vcftools has been checked
    conversion_error = None

    try:
        # Create the bed file, closed automatically by the with-block
        with open(output_filename, 'w') as bed_output:
            # Iterate the vcftools stdout unless error occurs
            for vcftools_stdout_line in iter(vcftools_call.stdout.readline, b''):
                # The pipe yields bytes in python 3
                if not isinstance(vcftools_stdout_line, str):
                    vcftools_stdout_line = vcftools_stdout_line.decode()
                bed_output.write(cvt_vcftools_site_to_bed(vcftools_stdout_line))
    except Exception as error:
        conversion_error = error
        # Delete the incomplete file
        if os.path.isfile(output_filename):
            os.remove(output_filename)

    # Close the vcftools stdout before waiting, so vcftools cannot stay
    # blocked on a full pipe if the conversion stopped early
    vcftools_call.stdout.close()

    # Read the vcftools stderr (drained before the wait)
    vcftools_stderr = vcftools_call.stderr.read()
    vcftools_call.stderr.close()

    # Wait for vctools to finish
    vcftools_call.wait()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        vcftools_stderr = vcftools_stderr.decode()

    # Check that the log file was created correctly
    check_vcftools_for_errors(vcftools_stderr)

    # vcftools reported no problem, so surface the conversion failure
    if conversion_error is not None:
        raise conversion_error

    logging.info('vcftools call complete')

    return vcftools_stderr
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
251
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def pipe_vcftools_bgzip (vcftools_call_args, output_filename):
    '''
    Pipes the output of vcftools to bgzip

    The purpose of this function is to avoid creating large uncompressed
    vcf files by directly piping the output of vcftools to bgzip. This
    results in creating a vcf.gz file without any intermediates.

    Parameters
    ----------
    vcftools_call_args : list
        vcftools arguments
    output_filename : str
        Filename of the compressed vcf file

    Returns
    -------
    vcftools_stderr : str
        vcftools log output

    Raises
    ------
    IOError
        If bgzip stderr returns an error
    Exception
        Whatever check_vcftools_for_errors raises for the vcftools log

    '''

    vcftools_call = pipe_vcftools(vcftools_call_args)

    # Create bgzip output file, the with-block closes it even when one of
    # the steps below raises
    with open(output_filename, 'wb') as bgzip_output:

        # bgzip subprocess call
        bgzip_call = subprocess.Popen(['bgzip'], stdin = vcftools_call.stdout, stdout = bgzip_output, stderr = subprocess.PIPE)

        # Close the parent copy of the vcftools stdout right away, so
        # vcftools receives SIGPIPE if bgzip exits early
        vcftools_call.stdout.close()

        # Read the vcftools stderr (drained before the wait)
        vcftools_stderr = vcftools_call.stderr.read()
        vcftools_call.stderr.close()

        # Wait for vctools to finish
        vcftools_call.wait()

        # Wait for bgzip to finish and save its stderr, stdout = None
        _, bgzip_stderr = bgzip_call.communicate()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        vcftools_stderr = vcftools_stderr.decode()
        bgzip_stderr = bgzip_stderr.decode()

    # Check that the log file was created correctly
    check_vcftools_for_errors(vcftools_stderr)

    # Check that output file was compressed correctly
    check_bgzip_for_errors(bgzip_stderr)

    logging.info('vcftools and bgzip calls complete')

    return vcftools_stderr
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
314
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def pipe_vcftools_bcftools (vcftools_call_args, output_filename):
    '''
    Pipes the output of vcftools to bcftools

    The purpose of this function is to avoid the vcftools command
    --recode-bcf that may result in malformed BCF files. To avoid large
    uncompressed intermediates, this function pipes the stdout of vcftools
    to bcftools.

    Parameters
    ----------
    vcftools_call_args : list
        vcftools arguments
    output_filename : str
        Filename of the BCF file

    Returns
    -------
    vcftools_stderr : str
        vcftools log output

    Raises
    ------
    Exception
        Whatever check_vcftools_for_errors or check_bcftools_for_errors
        raises for the respective stderr

    '''

    vcftools_call = pipe_vcftools(vcftools_call_args)

    # Holds the arguments to convert to BCF format and assign the output file
    convert_args = ['view', '-O', 'b', '-o', output_filename]

    # bcftools subprocess call
    bcftools_call = subprocess.Popen(['bcftools'] + convert_args, stdin = vcftools_call.stdout, stdout = subprocess.PIPE, stderr = subprocess.PIPE)

    # Close the parent copy of the vcftools stdout right away, so vcftools
    # receives SIGPIPE if bcftools exits early
    vcftools_call.stdout.close()

    # Read the vcftools stderr (drained before the wait)
    vcftools_stderr = vcftools_call.stderr.read()
    vcftools_call.stderr.close()

    # Wait for vctools to finish
    vcftools_call.wait()

    # Wait for bcftools to finish and save its stderr, the results are
    # written to the file given with -o
    _, bcftools_stderr = bcftools_call.communicate()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        vcftools_stderr = vcftools_stderr.decode()
        bcftools_stderr = bcftools_stderr.decode()

    # Check that the log file was created correctly
    check_vcftools_for_errors(vcftools_stderr)

    # Check that output file was converted correctly
    check_bcftools_for_errors(bcftools_stderr)

    logging.info('vcftools and bcftools calls complete')

    return vcftools_stderr
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
378
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def pipe_vcftools_to_file (vcftools_call_args, output_filename, append_output = False):
    '''
    Pipes file output of vcftools to a standard file

    The function calls vcftools and copies its stdout, line by line, into
    `output_filename`. Returns the stderr of vcftools to create log file of
    the call. The function may be used to append multiple calls to vcftools
    to a single file: when appending to a file that is not empty, the first
    line of the vcftools stdout (the header) is skipped.

    Parameters
    ----------
    vcftools_call_args : list
        vcftools arguments
    output_filename : str
        Filename of the file the vcftools stdout is written to
    append_output : bool
        The output file should be written in append mode

    Returns
    -------
    vcftools_err : str
        vcftools log output

    Raises
    ------
    Exception
        If the vcftools stdout could not be copied to the output file
        (the output file is deleted in this case)
    Exception
        If vcftools stderr returns an error
    '''

    # Open vcftools pipe
    # NOTE(review): presumably a subprocess.Popen with piped stdout/stderr;
    # confirm against pipe_vcftools
    vcftools_call = pipe_vcftools(vcftools_call_args)

    # Create the output file (in append or write mode, as requested)
    output_file = open(output_filename, 'a' if append_output else 'w')

    try:
        # The with statement closes the file on success and on failure
        with output_file:

            # Create iterator of the vcftools stdout
            stdout_iter = iter(vcftools_call.stdout.readline, b'')

            # Check if the output is being appended to a non-empty file
            if append_output and os.stat(output_filename).st_size != 0:
                # Skip the header. The default stops an empty stdout (e.g.
                # a failed vcftools call) from raising StopIteration, so
                # the real error is reported from the stderr check below
                next(stdout_iter, None)

            # Iterate the vcftools stdout
            for vcftools_stdout_line in stdout_iter:

                # Check if code is running in python 3
                if sys.version_info[0] == 3:
                    # Convert bytes to string
                    vcftools_stdout_line = vcftools_stdout_line.decode()

                output_file.write(vcftools_stdout_line)

    except Exception:
        # Delete the (possibly partial) output file
        os.remove(output_filename)

        raise Exception('vcftools to python pipe error')

    except BaseException:
        # KeyboardInterrupt/SystemExit: delete the output file as well, but
        # let the interrupt propagate instead of masking it
        os.remove(output_filename)

        raise

    # Wait for vcftools to finish
    vcftools_call.wait()

    # Close the vcftools stdout
    vcftools_call.stdout.close()

    # Read the vcftools stderr
    # NOTE(review): stderr is only read once stdout is exhausted; if it is
    # a pipe, a very large vcftools log could stall the call - confirm
    vcftools_stderr = vcftools_call.stderr.read()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string
        vcftools_stderr = vcftools_stderr.decode()

    # Check that vcftools finished without error
    check_vcftools_for_errors(vcftools_stderr)

    logging.info('vcftools call complete')

    return vcftools_stderr
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
467
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def standard_vcftools_call (vcftools_call_args):
    '''
    Runs vcftools and returns its log

    Launches vcftools with the given arguments (each converted to a
    string), waits for the process to exit and validates the captured
    stderr with `check_vcftools_for_errors`. The captured stdout is
    discarded.

    Parameters
    ----------
    vcftools_call_args : list
        vcftools arguments

    Returns
    -------
    vcftools_err : str
        vcftools log output (the stderr of the call)

    Raises
    ------
    Exception
        If vcftools stderr returns an error
    '''

    # Assemble the complete command; arguments are passed on as strings
    vcftools_command = ['vcftools'] + [str(call_arg) for call_arg in vcftools_call_args]

    # Start vcftools with both output streams captured
    vcftools_process = subprocess.Popen(vcftools_command,
                                        stdout = subprocess.PIPE,
                                        stderr = subprocess.PIPE)

    # communicate() blocks until vcftools exits; only the stderr is kept
    vcftools_log = vcftools_process.communicate()[1]

    # Python 3 returns bytes from the pipe, convert them to a string
    if sys.version_info[0] == 3:
        vcftools_log = vcftools_log.decode()

    logging.info('vcftools call complete')

    # Raise if the log does not report a successful run
    check_vcftools_for_errors(vcftools_log)

    return vcftools_log
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
510
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def call_vcftools (vcftools_call_args, output_format = None, output_filename = None):
    '''
    Calls vcftools using the routine matching the output format

    Selects one of the vcftools call functions of this module from
    `output_format` and returns the vcftools log that function produces.

    Parameters
    ----------
    vcftools_call_args : list
        vcftools arguments
    output_format : str
        The output format. 'vcf.gz', 'bcf', 'removed_bed', 'kept_bed' and
        'het-fis' are piped calls; any other value (including None) results
        in a standard vcftools call
    output_filename : str
        The output filename (only passed on to the piped calls)

    Returns
    -------
    vcftools_err : str
        vcftools log output

    Raises
    ------
    Exception
        If vcftools stderr returns an error
    '''

    # bgzipped vcf output
    if output_format == 'vcf.gz':
        return pipe_vcftools_bgzip(vcftools_call_args, output_filename)

    # bcf output
    if output_format == 'bcf':
        return pipe_vcftools_bcftools(vcftools_call_args, output_filename)

    # bed output (removed or kept sites)
    if output_format in ('removed_bed', 'kept_bed'):
        return pipe_vcftools_to_bed_file(vcftools_call_args, output_filename)

    # het-fis output is appended to a single file
    if output_format == 'het-fis':
        return pipe_vcftools_to_file(vcftools_call_args, output_filename, append_output = True)

    # All other formats: call vcftools under standard conditions
    return standard_vcftools_call(vcftools_call_args)
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
559
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def check_for_vcftools_output (vcftools_output):
    '''
    Confirms that no previous vcftools output is present

    Verifies that neither `vcftools_output` nor its log file
    (`vcftools_output` followed by '.log') already exists as a file.

    Parameters
    ----------
    vcftools_output : str
        Specifies the output filename to be checked

    Raises
    ------
    IOError
        If the vcftools output file exists
    IOError
        If the vcftools log file exists

    '''

    # Refuse to continue if the output file is already present
    output_exists = os.path.isfile(vcftools_output)
    if output_exists:
        raise IOError('VCF output file already exists')

    logging.info('Output file assigned')

    # The log file shares the output filename, with a .log suffix
    log_filename = vcftools_output + '.log'

    # Refuse to continue if the log file is already present
    if os.path.isfile(log_filename):
        raise IOError('Log file already exists')

    logging.info('Log file assigned')
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
590
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def delete_vcftools_output (vcftools_output):
    '''
    Deletes previous vcftools output

    Confirms if previous vcftools output exists, and if so, deletes it. The
    same is done for the log file (`vcftools_output` followed by '.log').

    Parameters
    ----------
    vcftools_output : str
        Specifies the output filename to be deleted

    Raises
    ------
    IOError
        If the vcftools output cannot be deleted
    IOError
        If the vcftools log cannot be deleted
    '''

    # Check if output file already exists
    if os.path.isfile(vcftools_output):
        try:
            # Delete the output
            os.remove(vcftools_output)
        # Only filesystem failures are reported as a deletion problem;
        # interrupts and unrelated errors are left to propagate unchanged
        except OSError:
            raise IOError('VCF output file cannot be deleted')

    logging.info('Output file assigned')

    # Check if log file already exists
    if os.path.isfile(vcftools_output + '.log'):
        try:
            # Delete the log
            os.remove(vcftools_output + '.log')
        except OSError:
            raise IOError('Log file cannot be deleted')

    logging.info('Log file assigned')
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
629
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def check_vcftools_for_errors (vcftools_stderr):
    '''
    Checks the vcftools stderr for errors

    A log containing 'Run Time' is accepted as a completed run. Any other
    log results in an exception: if the log contains 'Error', the message
    holds the lines starting with 'Error' (or the complete log, if no line
    starts with it); otherwise the message is the complete log.

    Parameters
    ----------
    vcftools_stderr : str
        vcftools stderr

    Raises
    ------
    Exception
        If vcftools stderr reports an error, or does not report that the
        run completed
    '''

    # Use a single string form of the log for every check below
    stderr_text = str(vcftools_stderr)

    # The job completed without error, nothing to report
    if 'Run Time' in stderr_text:
        return

    # Report the error(s) found in the log
    if 'Error' in stderr_text:
        # Keep only the lines that begin with the error label
        error_lines = [stderr_line for stderr_line in stderr_text.splitlines()
                       if stderr_line.startswith('Error')]

        # 'Error' may occur without starting a line (e.g. indented text);
        # report the complete log then, rather than an empty message
        if not error_lines:
            raise Exception(stderr_text)

        raise Exception('\n'.join(error_lines))

    # Not completed and no error found. Unlikely to be used, but included.
    raise Exception(vcftools_stderr)
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
659
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def produce_vcftools_output (output, filename, append_mode = False, strip_header = False):
    '''
    Creates the vcftools output file

    This function will create an output file from the vcftools stdout.
    Please run `check_vcftools_for_errors` prior to check that vcftools
    finished without error. The function does not return a value.

    Parameters
    ----------
    output : str
        vcftools stdout
    filename : str
        Specifies the filename for the output file
    append_mode : bool
        Used to create a single output file from multiple calls
    strip_header : bool
        Used to remove the header (the first line of `output`) if not
        needed

    '''

    # Check if the header should be stripped
    if strip_header:
        # Line endings are kept so the remaining lines are joined unchanged
        output = ''.join(output.splitlines(True)[1:])

    # Append if a single file is required from multiple calls, else write
    file_mode = 'a' if append_mode else 'w'

    # The with statement closes the file even if the write fails
    with open(filename, file_mode) as vcftools_output_file:
        vcftools_output_file.write(str(output))
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
698
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def produce_vcftools_log (output, filename, append_mode = False):
    '''
    Creates the vcftools log file

    This function will create a log file from the vcftools stderr. Please
    run `check_vcftools_for_errors` prior to check that vcftools finished
    without error. The log is written to `filename` followed by '.log'.
    The function does not return a value.

    Parameters
    ----------
    output : str
        vcftools stderr
    filename : str
        Specifies the filename for the log file (without the '.log'
        suffix, which is added by the function)
    append_mode : bool
        Used to create a single log file from multiple calls

    '''

    # Append if a single log file is required from multiple calls
    file_mode = 'a' if append_mode else 'w'

    # The with statement closes the file even if the write fails
    with open(filename + '.log', file_mode) as vcftools_log_file:
        vcftools_log_file.write(str(output))
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
730
d1e3db7f6521 Uploaded
jaredgk
parents:
diff changeset
def assign_vcftools_input_arg (filename):
    '''
    Assigns the vcftools input argument for a file

    The file extension is examined first. If it is not one of '.vcf',
    '.vcf.gz' or '.bcf', the format reported by `checkFormat` is used
    instead.

    Parameters
    ----------
    filename : str
        Specifies the input filename of unknown format

    Returns
    -------
    list
        Returns vcftools input command for `filename`

    Raises
    ------
    Exception
        If filename is an unknown file format
    '''

    # Extensions recognized by vcftools, paired with their input command
    extension_commands = (('.vcf', '--vcf'),
                          ('.vcf.gz', '--gzvcf'),
                          ('.bcf', '--bcf'))

    # Return the command of the first matching extension
    for file_extension, input_command in extension_commands:
        if filename.endswith(file_extension):
            return [input_command, filename]

    # Extension is unknown or not recognized: ask checkFormat
    # NOTE(review): presumably inspects the file itself - confirm against
    # vcf_reader_func.checkFormat
    vcfname_format = checkFormat(filename)

    # Formats reported by checkFormat, paired with their input command
    format_commands = (('vcf', '--vcf'),
                       ('bgzip', '--gzvcf'),
                       ('bcf', '--bcf'))

    # Return the command of the matching format
    for reported_format, input_command in format_commands:
        if vcfname_format == reported_format:
            return [input_command, filename]

    # No supported format was reported
    raise Exception('Unknown VCF file format')