annotate beagle.py @ 5:86a9d8d5b291 draft default tip

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:34:34 -0400
parents 54c84f7dcb2c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
1 import os
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
2 import sys
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
3 import subprocess
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
4 import shutil
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
5 import argparse
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
6 import glob
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
7 import logging
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
8
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
9 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
10
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
11 from vcf_reader_func import checkFormat
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
12 from logging_module import initLogger, logArgs
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
13 from vcftools import bgzip_decompress_vcfgz
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
14 from bcftools import convert_to_bcf, check_for_index, create_index
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
15
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
16 def delete_beagle_log (output_prefix):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
17 '''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
18 Delete beagle log file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
19
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
20 This function is used to delete beagle's log file if an error is
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
21 encountered. A warning is produced if the log file cannot be found.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
22
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
23 Parameters
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
24 ----------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
25 output_prefix : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
26 Output file prefix
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
27 '''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
28
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
29 # Check that log file exists, if not return warning
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
30 if not os.path.isfile(output_prefix + '.log'):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
31 logging.warning('beagle log file %s.log does not exist' % output_prefix)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
32 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
33 os.remove(output_prefix + '.log')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
34
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
35 def check_beagle_for_errors (beagle_stderr, output_prefix):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
36 '''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
37 Checks the beagle stdout for errors
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
38
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
39 Parameters
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
40 ----------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
41 beagle_stderr : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
42 beagle stderr
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
43 output_prefix : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
44 Output file prefix
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
45
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
46 Raises
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
47 ------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
48 Exception
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
49 If beagle stdout returns an error
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
50 '''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
51
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
52 # Check if beagle completed without an error
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
53 if not beagle_stderr.strip():
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
54 pass
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
55
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
56 # Print missing data message if that is likely
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
57 elif 'ERROR: genotype is missing allele separator:' in str(beagle_stderr):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
58 # Delete the beagle log file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
59 delete_beagle_log(output_prefix)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
60
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
61 # Store reported error
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
62 error_reported = 'ERROR: genotype is missing allele separator'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
63 # Store message for user about error
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
64 user_message = 'Please confirm the input has no missing data.'
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
65 # Report on the error
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
66 raise Exception(error_reported + '\n' + user_message)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
67
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
68 # Print output for beagle if error is detected
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
69 elif 'ERROR:' in str(beagle_stderr):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
70 # Delete the beagle log file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
71 delete_beagle_log(output_prefix)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
72
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
73 # Splits log into list of lines
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
74 beagle_stderr_lines = beagle_stderr.splitlines()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
75 # Prints the error(s)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
76 raise Exception('\n'.join((output_line for output_line in beagle_stderr_lines if output_line.startswith('ERROR:'))))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
77
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
78 # Print output if not completed and no error found. Unlikely to be used, but included.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
79 else:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
80 # Delete the beagle log file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
81 delete_beagle_log(output_prefix)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
82
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
83 raise Exception(beagle_stderr)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
84
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
85
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
86 def standard_beagle_call (beagle_path, beagle_call_args, output_prefix):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
87 '''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
88 Calls beagle using subprocess
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
89
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
90 This function is used to call beagle under standard conditions. The
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
91 functions then passes the stderr to check_beagle_for_errors to check
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
92 for errors.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
93
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
94 Parameters
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
95 ----------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
96 beagle_path : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
97 Path to beagle.jar
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
98 beagle_call_args : list
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
99 Argument list for beagle
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
100 output_prefix : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
101 Output file prefix
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
102 '''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
103
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
104 # Assign location of beagle jar file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
105 beagle_jar = os.path.join(beagle_path, 'beagle.jar')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
106
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
107 # Check that beagle.jar exists
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
108 if not os.path.isfile(beagle_jar):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
109 raise IOError('beagle.jar not found. Path specified: %s' % beagle_path)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
110
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
111 logging.info('beagle phasing parameters assigned')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
112
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
113 # Phasing subprocess call
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
114 phase_call = subprocess.Popen(['java', '-jar', beagle_jar] + beagle_call_args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
115 phase_stdout, phase_stderr = phase_call.communicate()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
116
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
117 # Check if code is running in python 3
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
118 if sys.version_info[0] == 3:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
119 # Convert bytes to string
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
120 phase_stderr = phase_stderr.decode()
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
121
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
122 # Check beagle call for errors
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
123 check_beagle_for_errors(phase_stderr, output_prefix)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
124
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
125 logging.info('beagle phasing complete')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
126
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
127 def call_beagle (beagle_path, beagle_call_args, output_prefix, output_format):
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
128 '''
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
129 Automates beagle calls
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
130
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
131 This function passes the argument list to standard_beagle_call. Once the
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
132 beagle call has finished, the function will automatically convert the
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
133 bgzip compressed output of beagle to BCF and VCF, if either format is
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
134 specified.
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
135
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
136 Parameters
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
137 ----------
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
138 beagle_path : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
139 Path to beagle.jar
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
140 beagle_call_args : list
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
141 Argument list for beagle
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
142 output_prefix : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
143 Output file prefix
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
144 output_format : str
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
145 Output file format
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
146 '''
2
54c84f7dcb2c Uploaded
jaredgk
parents: 0
diff changeset
147 print (beagle_call_args)
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
148 # Standard call to beagle
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
149 standard_beagle_call(beagle_path, beagle_call_args, output_prefix)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
150
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
151 # Decompress if a VCF files is requested
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
152 if output_format == 'vcf':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
153 bgzip_decompress_vcfgz(output_prefix + '.vcf.gz')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
154
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
155 # Convert to BCF if requested
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
156 elif output_format == 'bcf':
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
157
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
158 # Check if there is an index file
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
159 if check_for_index(output_prefix + '.vcf.gz') == False:
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
160 # Create an index if not found
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
161 create_index(output_prefix + '.vcf.gz')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
162 # Convert vcf.gz to bcf
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
163 convert_to_bcf(output_prefix + '.vcf.gz', output_prefix)