annotate bcftools.py @ 3:d1e3db7f6521 draft

Uploaded
author jaredgk
date Wed, 17 Oct 2018 17:28:38 -0400
parents 3830d29fca6a
children 86a9d8d5b291
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
1 import os
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
2 import sys
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
3 import logging
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
4 import subprocess
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
5
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
6 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
7
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
8 from vcf_reader_func import checkFormat
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
9
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def check_bcftools_for_errors (bcftools_stderr):
    '''
    Checks the bcftools stderr for errors

    Any non-empty stderr is currently treated as an error: it is logged
    and then raised.

    Parameters
    ----------
    bcftools_stderr : str or bytes
        bcftools stderr. Bytes (as returned by a piped subprocess) are
        decoded so the logged/raised message is readable text

    Raises
    ------
    Exception
        If bcftools stderr is not empty
    '''

    # Piped subprocess output arrives as bytes; decode it so the message
    # is not rendered as a b'...' repr
    if isinstance(bcftools_stderr, bytes):
        bcftools_stderr = bcftools_stderr.decode('utf-8', 'replace')

    # Expand as errors are discovered
    if bcftools_stderr:
        logging.error(bcftools_stderr)
        raise Exception(bcftools_stderr)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
29
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def call_bcftools (bcftools_call_args):
    '''
    Runs bcftools with the given arguments and waits for it to finish

    Parameters
    ----------
    bcftools_call_args : list
        Arguments passed to bcftools (sub-command, options, files).
        Each item is converted to str before the call

    Raises
    ------
    Exception
        If check_bcftools_for_errors rejects the captured stderr, or if
        bcftools exits with a non-zero status
    '''

    # bcftools subprocess call (stdout and stderr are both captured)
    bcftools_call = subprocess.Popen(['bcftools'] + [str(arg) for arg in bcftools_call_args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Wait for bcftools to finish; only stderr is inspected
    bcftools_err = bcftools_call.communicate()[1]

    check_bcftools_for_errors(bcftools_err)

    # A failure that wrote nothing to stderr would otherwise be reported
    # as a successful call
    if bcftools_call.returncode != 0:
        returncode_message = 'bcftools exited with status %s' % bcftools_call.returncode
        logging.error(returncode_message)
        raise Exception(returncode_message)

    logging.info('bcftools call complete')
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
41
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def check_for_index(filename):
    '''
    Reports whether an index file exists next to the given file

    The format reported by checkFormat selects the index extension that
    is looked for: '.tbi' for 'bgzip' files and '.csi' for 'bcf' files.

    Parameters
    ----------
    filename : str
        Path of the file whose index is looked up

    Returns
    -------
    bool
        True if the matching index file exists, False if it does not
        or if the format is neither 'bgzip' nor 'bcf'
    '''

    detected_format = checkFormat(filename)

    if detected_format == 'bgzip':
        # bgzip-compressed VCF: look for the .tbi index
        index_path = filename + '.tbi'
    elif detected_format == 'bcf':
        # BCF: look for the .csi index
        index_path = filename + '.csi'
    else:
        # No index is looked up for any other format
        return False

    return os.path.isfile(index_path)
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
61
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def create_index(filename):
    '''
    Creates an index for the given file using `bcftools index`

    Files reported as 'bgzip' by checkFormat are indexed with the '-t'
    option and files reported as 'bcf' with the '-c' option.

    Parameters
    ----------
    filename : str
        Path of the file to index

    Raises
    ------
    Exception
        If the file format is neither 'bgzip' nor 'bcf'
    '''

    detected_format = checkFormat(filename)

    # Reject anything that cannot be indexed before calling bcftools
    if detected_format not in ('bgzip', 'bcf'):
        raise Exception('Error creating index for: %s. Only .bcf and .vcf.gz (bgzip) files are supported.' % filename)

    # '-t' requests a .tbi index (vcf.gz), '-c' a .csi index (bcf)
    index_option = '-t' if detected_format == 'bgzip' else '-c'

    call_bcftools(['index', index_option, filename])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
80
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def convert_to_bcf(filename, output_prefix):
    '''
    Converts the given file to BCF using `bcftools convert`

    Parameters
    ----------
    filename : str
        Path of the input file
    output_prefix : str
        Prefix of the output file; '.bcf' is appended to it
    '''

    # Output path: the prefix with the BCF extension appended
    output_path = '%s.bcf' % output_prefix

    # Sub-command and output type, then output file, then input file
    call_bcftools(['convert', '-O', 'b', '-o', output_path, filename])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
97
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
98
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def convert_to_vcf(filename, output_prefix):
    '''
    Converts the given file to VCF using `bcftools view`

    Parameters
    ----------
    filename : str
        Path of the input file
    output_prefix : str
        Prefix of the output file; '.vcf' is appended to it
    '''

    # Output path: the prefix with the VCF extension appended
    output_path = '%s.vcf' % output_prefix

    # Sub-command and output type, then output file, then input file
    call_bcftools(['view', '-O', 'v', '-o', output_path, filename])
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
115
3830d29fca6a Uploaded
jaredgk
parents:
diff changeset
def convert_to_vcfgz(filename, output_prefix):
    '''
    Converts the given file to compressed VCF using `bcftools view`

    Parameters
    ----------
    filename : str
        Path of the input file
    output_prefix : str
        Prefix of the output file; '.vcf.gz' is appended to it
    '''

    # Output path: the prefix with the VCFGZ extension appended
    output_path = '%s.vcf.gz' % output_prefix

    # Sub-command and output type, then output file, then input file
    call_bcftools(['view', '-O', 'z', '-o', output_path, filename])