annotate shapeit.py @ 5:86a9d8d5b291 draft default tip

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:34:34 -0400
parents 3830d29fca6a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
1 import os
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
2 import sys
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
3 import subprocess
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
4 import shutil
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
5 import argparse
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
6 import glob
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
7 import logging
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
8
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
9 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
10
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
11 from vcf_reader_func import checkFormat
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
12 from logging_module import initLogger, logArgs
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
13 from plink import convert_haps_to_vcf
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
14 #from vcftools import bgzip_decompress_vcfgz
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
15 #from bcftools import convert_to_bcf, check_for_index, create_index
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
16
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def check_shapeit_for_errors(shapeit_stdout, output_prefix):
    '''
    Checks the shapeit stdout for errors

    The run is considered successful only when the text 'Running time:'
    appears in the stdout. Otherwise the intermediate files for
    output_prefix are removed and the stdout is raised as the error.

    Parameters
    ----------
    shapeit_stdout : str
        shapeit stdout
    output_prefix : str
        Output filename prefix

    Raises
    ------
    Exception
        If 'Running time:' is missing from the shapeit stdout; the
        exception message is the stdout text itself
    '''

    stdout_text = str(shapeit_stdout)

    # Completion marker found: the job finished, nothing to report
    if 'Running time:' in stdout_text:
        return

    # Remove intermediate files before reporting the error. A failure while
    # cleaning up is only logged, so that it cannot replace the shapeit
    # output as the error seen by the caller.
    try:
        remove_intermediate_files(output_prefix, error_intermediates=True)
    except OSError as cleanup_error:
        logging.warning('Unable to remove shapeit intermediate files: %s', cleanup_error)

    raise Exception(stdout_text)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
43
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def remove_intermediate_files(output_prefix, error_intermediates=False):
    '''
    Removes shapeit intermediate files

    The set of files to remove depends on the error-state of shapeit:
    after an error only the '.phase.log' file is removed; otherwise the
    '.phase.ind.mm', '.phase.snp.mm', '.haps' and '.sample' files are
    removed. A warning is logged (nothing is returned or raised) for each
    expected file that is not found.

    Parameters
    ----------
    output_prefix : str
        Output filename prefix
    error_intermediates : bool, optional
        Defines if shapeit encountered an error

    '''

    # Suffixes of the files expected for the given error-state
    if error_intermediates:
        intermediate_suffixes = ('.phase.log',)
    else:
        intermediate_suffixes = ('.phase.ind.mm', '.phase.snp.mm', '.haps', '.sample')

    for suffix in intermediate_suffixes:
        intermediate_path = output_prefix + suffix

        # Check that the file was created, give a warning otherwise
        if not os.path.isfile(intermediate_path):
            logging.warning('shapeit intermediate file %s does not exist' % intermediate_path)
            continue

        # Remove the shapeit intermediate file
        os.remove(intermediate_path)

    logging.info('shapeit-related files removed')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
101
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def standard_shapeit_call(shapeit_call_args, output_prefix):
    '''
    Calls shapeit using subprocess

    This function is used to call shapeit and passes the resulting stdout
    to check_shapeit_for_errors to check for errors. The function also
    passes output_prefix to check_shapeit_for_errors to delete shapeit
    intermediate files if shapeit results in an error. Anything shapeit
    writes to stderr is logged as a warning rather than discarded.

    Parameters
    ----------
    shapeit_call_args : list
        Argument list for shapeit
    output_prefix : str
        Output filename prefix

    '''

    logging.info('shapeit phasing parameters assigned')

    # Phasing subprocess call
    phase_call = subprocess.Popen(['shapeit'] + shapeit_call_args,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    phase_stdout, phase_stderr = phase_call.communicate()

    # Check if code is running in python 3
    if sys.version_info[0] == 3:
        # Convert bytes to string. Undecodable bytes are replaced so that
        # unusual tool output cannot raise before the error check runs.
        phase_stdout = phase_stdout.decode('utf-8', 'replace')
        phase_stderr = phase_stderr.decode('utf-8', 'replace')

    # Record stderr before the error check, which may raise
    stderr_text = phase_stderr.strip()
    if stderr_text:
        logging.warning('shapeit stderr: %s', stderr_text)

    # Check shapeit call for errors
    check_shapeit_for_errors(phase_stdout, output_prefix)

    logging.info('shapeit phasing complete (HAPS format)')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
135
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def call_shapeit(shapeit_call_args, output_prefix, output_format):
    '''
    Runs shapeit and converts the phased output

    Phasing is delegated to standard_shapeit_call. Afterwards
    output_prefix and output_format are handed to convert_haps_to_vcf,
    which performs the conversion of the HAPS-format results.

    Parameters
    ----------
    shapeit_call_args : list
        Argument list for shapeit
    output_prefix : str
        Output filename prefix
    output_format : str
        Output file format, passed on to convert_haps_to_vcf

    '''

    # Phase with shapeit; raises if shapeit reports an error
    standard_shapeit_call(shapeit_call_args, output_prefix)

    # Turn the HAPS-format results into the requested output format
    convert_haps_to_vcf(output_prefix, output_format)

    logging.info('HAPS conversion to VCF complete')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
159
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
160 logging.info('HAPS conversion to VCF complete')