comparison tools/coverage_stats/coverage_stats.py @ 2:7254ece0c0ff draft

v0.0.5 - Supports max coverage depth in recent samtools. Expects samtools 1.4.1 via Conda, not via Tool Shed.
author peterjc
date Thu, 11 May 2017 12:16:10 -0400
parents ca8f63f2f7d4
children
comparison
equal deleted inserted replaced
1:d1fdfaae5dbe 2:7254ece0c0ff
4 This script takes exactly three command line arguments: 4 This script takes exactly three command line arguments:
5 * Input BAM filename 5 * Input BAM filename
6 * Input BAI filename (via Galaxy metadata) 6 * Input BAI filename (via Galaxy metadata)
7 * Output tabular filename 7 * Output tabular filename
8 8
9 Optional fourth argument:
10 * Max coverage depth (integer)
11
9 This messes about with the filenames to make samtools happy, then 12 This messes about with the filenames to make samtools happy, then
10 runs "samtools idxstats" and "samtools depth", captures and combines 13 runs "samtools idxstats" and "samtools depth", captures and combines
11 the output to the desired output tabular file. 14 the output to the desired output tabular file.
15
16 Because "samtools depth" treats the max depth a little fuzzily, this
17 tool tries to account for this and applies a clear max-depth cut off.
12 """ 18 """
13 import sys 19
14 import os 20 import os
15 import subprocess 21 import subprocess
22 import sys
16 import tempfile 23 import tempfile
17 24
18 if "-v" in sys.argv or "--version" in sys.argv: 25 if "-v" in sys.argv or "--version" in sys.argv:
19 #Galaxy seems to invert the order of the two lines 26 # Galaxy seems to invert the order of the two lines
20 print("BAM coverage statistics v0.0.1") 27 print("BAM coverage statistics v0.0.5")
21 cmd = "samtools 2>&1 | grep -i ^Version" 28 cmd = "samtools 2>&1 | grep -i ^Version"
22 sys.exit(os.system(cmd)) 29 sys.exit(os.system(cmd))
23 30
24 def stop_err(msg, error_level=1): 31 # TODO - Proper command line API
25 """Print error message to stdout and quit with given error level.""" 32 if len(sys.argv) == 4:
26 sys.stderr.write("%s\n" % msg) 33 bam_filename, bai_filename, tabular_filename = sys.argv[1:]
27 sys.exit(error_level) 34 max_depth = "8000"
28 35 elif len(sys.argv) == 5:
29 if len(sys.argv) != 4: 36 bam_filename, bai_filename, tabular_filename, max_depth = sys.argv[1:]
30 stop_err("Require three arguments: BAM, BAI, tabular filenames") 37 else:
31 38 sys.exit("Require 3 or 4 arguments: BAM, BAI, tabular filename, [max depth]")
32 bam_filename, bai_filename, tabular_filename = sys.argv[1:]
33 39
34 if not os.path.isfile(bam_filename): 40 if not os.path.isfile(bam_filename):
35 stop_err("Input BAM file not found: %s" % bam_filename) 41 sys.exit("Input BAM file not found: %s" % bam_filename)
42 if bai_filename == "-":
43 # Make this optional for ease of use at the command line by hand:
44 if os.path.isfile(bam_filename + ".bai"):
45 bai_filename = bam_filename + ".bai"
36 if not os.path.isfile(bai_filename): 46 if not os.path.isfile(bai_filename):
37 if bai_filename == "None": 47 if bai_filename == "None":
38 stop_err("Error: Galaxy did not index your BAM file") 48 sys.exit("Error: Galaxy did not index your BAM file")
39 stop_err("Input BAI file not found: %s" % bai_filename) 49 sys.exit("Input BAI file not found: %s" % bai_filename)
40 50
41 #Assign sensible names with real extensions, and setup symlinks: 51 try:
52 max_depth = int(max_depth)
53 except ValueError:
54 sys.exit("Bad argument for max depth: %r" % max_depth)
55 if max_depth < 0:
56 sys.exit("Bad argument for max depth: %r" % max_depth)
57
58 # fuzz factor to ensure can reach max_depth, e.g. region with
59 # many reads having a deletion present could underestimate the
60 # coverage by capping the number of reads considered
61 depth_margin = 100
62
63 # Assign sensible names with real extensions, and setup symlinks:
42 tmp_dir = tempfile.mkdtemp() 64 tmp_dir = tempfile.mkdtemp()
43 bam_file = os.path.join(tmp_dir, "temp.bam") 65 bam_file = os.path.join(tmp_dir, "temp.bam")
44 bai_file = os.path.join(tmp_dir, "temp.bam.bai") 66 bai_file = os.path.join(tmp_dir, "temp.bam.bai")
45 idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv") 67 idxstats_filename = os.path.join(tmp_dir, "idxstats.tsv")
46 depth_filename = os.path.join(tmp_dir, "depth.tsv") 68 depth_filename = os.path.join(tmp_dir, "depth.tsv")
48 os.symlink(os.path.abspath(bai_filename), bai_file) 70 os.symlink(os.path.abspath(bai_filename), bai_file)
49 assert os.path.isfile(bam_file), bam_file 71 assert os.path.isfile(bam_file), bam_file
50 assert os.path.isfile(bai_file), bai_file 72 assert os.path.isfile(bai_file), bai_file
51 assert os.path.isfile(bam_file + ".bai"), bam_file 73 assert os.path.isfile(bam_file + ".bai"), bam_file
52 74
75
53 def clean_up(): 76 def clean_up():
77 """Remove our temporary files and directory."""
54 os.remove(idxstats_filename) 78 os.remove(idxstats_filename)
55 os.remove(depth_filename) 79 os.remove(depth_filename)
56 os.remove(bam_file) 80 os.remove(bam_file)
57 os.remove(bai_file) 81 os.remove(bai_file)
58 os.rmdir(tmp_dir) 82 os.rmdir(tmp_dir)
59 83
84
85 def samtools_depth_opt_available():
86 """Determine if samtools depth supports maximum coverage argument."""
87 child = subprocess.Popen(["samtools", "depth"],
88 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
89 # Combined stdout/stderr in case samtools is ever inconsistent
90 output, tmp = child.communicate()
91 assert tmp is None
92 # Expect to find this line in the help text, exact wording could change:
93 # -d/-m <int> maximum coverage depth [8000]
94 return " -d/-m " in output
95
96
97 depth_hack = False
98 if not samtools_depth_opt_available():
99 if max_depth + depth_margin <= 8000:
100 sys.stderr.write("WARNING: The version of samtools depth installed does not "
101 "support the -d option, however, the requested max-depth "
102 "is safely under the default of 8000.\n")
103 depth_hack = True
104 else:
105 sys.exit("The version of samtools depth installed does not support the -d option.")
106
60 # Run samtools idxstats: 107 # Run samtools idxstats:
61 cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename) 108 cmd = 'samtools idxstats "%s" > "%s"' % (bam_file, idxstats_filename)
62 return_code = os.system(cmd) 109 return_code = os.system(cmd)
63 if return_code: 110 if return_code:
64 clean_up() 111 clean_up()
65 stop_err("Return code %i from command:\n%s" % (return_code, cmd)) 112 sys.exit("Return code %i from command:\n%s" % (return_code, cmd))
66 113
67 # Run samtools depth: 114 # Run samtools depth:
68 # TODO - Parse stdout instead? 115 # TODO - Parse stdout instead?
69 cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename) 116 if depth_hack:
117 # Using an old samtools without the -d option, but hard coded default
118 # of 8000 should be fine even allowing a margin for fuzzy output
119 cmd = 'samtools depth "%s" > "%s"' % (bam_file, depth_filename)
120 else:
121 cmd = 'samtools depth -d %i "%s" > "%s"' % (max_depth + depth_margin, bam_file, depth_filename)
70 return_code = os.system(cmd) 122 return_code = os.system(cmd)
71 if return_code: 123 if return_code:
72 clean_up() 124 clean_up()
73 stop_err("Return code %i from command:\n%s" % (return_code, cmd)) 125 sys.exit("Return code %i from command:\n%s" % (return_code, cmd))
126
74 127
75 def load_total_coverage(depth_handle, identifier, length): 128 def load_total_coverage(depth_handle, identifier, length):
76 """Parse some of the 'samtools depth' output for coverages. 129 """Parse some of the 'samtools depth' output for coverages.
77 130
78 Returns min_cov (int), max_cov (int) and mean cov (float). 131 Returns min_cov (int), max_cov (int) and mean cov (float).
84 137
85 # print("====") 138 # print("====")
86 # print("%s coverage calculation, length %i, ..." % (identifier, length)) 139 # print("%s coverage calculation, length %i, ..." % (identifier, length))
87 140
88 if depth_ref is None: 141 if depth_ref is None:
89 # Right at start of file! 142 # Right at start of file / new contig
90 line = depth_handle.readline() 143 line = depth_handle.readline()
144 # Are we at the end of the file?
145 if not line:
146 # Must be at the end of the file.
147 # This can happen if the file contig(s) had no reads mapped
148 return 0, 0, 0.0, 0
91 depth_ref, depth_pos, depth_reads = line.rstrip("\n").split() 149 depth_ref, depth_pos, depth_reads = line.rstrip("\n").split()
92 depth_pos = int(depth_pos) 150 depth_pos = int(depth_pos)
93 depth_reads = int(depth_reads) 151 depth_reads = min(max_depth, int(depth_reads))
94 # Can now treat as later references where first line cached 152 # Can now treat as later references where first line cached
95 elif identifier != depth_ref: 153 elif identifier != depth_ref:
96 # Infer that identifier had coverage zero, 154 # Infer that identifier had coverage zero,
97 # and so was not in the 'samtools depth' 155 # and so was not in the 'samtools depth'
98 # output. 156 # output.
99 # print("%s appears to have no coverage at all" % identifier) 157 # print("%s appears to have no coverage at all" % identifier)
100 return 0, 0, 0.0 158 return 0, 0, 0.0, 0
101 159
102 # Good, at start of expected reference 160 # Good, at start of expected reference
103 bases = depth_reads 161 bases = depth_reads
104 if depth_pos == 1: 162 if depth_pos == 1:
105 min_cov = depth_reads 163 min_cov = depth_reads
113 depth_pos = 0 171 depth_pos = 0
114 depth_reads = 0 172 depth_reads = 0
115 for line in depth_handle: 173 for line in depth_handle:
116 ref, pos, depth = line.rstrip("\n").split() 174 ref, pos, depth = line.rstrip("\n").split()
117 pos = int(pos) 175 pos = int(pos)
118 depth = int(depth) 176 depth = min(max_depth, int(depth))
119 if ref != identifier: 177 if ref != identifier:
120 # Reached the end of this identifier's coverage 178 # Reached the end of this identifier's coverage
121 # so cache this ready for next identifier 179 # so cache this ready for next identifier
122 depth_ref, depth_pos, depth_reads = ref, pos, depth 180 depth_ref, depth_pos, depth_reads = ref, pos, depth
123 break 181 break
133 # Reach the end of this identifier's coverage or end of file 191 # Reach the end of this identifier's coverage or end of file
134 if last_pos < length: 192 if last_pos < length:
135 # print("%s has no coverage at end" % identifier) 193 # print("%s has no coverage at end" % identifier)
136 min_cov = 0 194 min_cov = 0
137 mean_cov = bases / float(length) 195 mean_cov = bases / float(length)
138 return min_cov, max_cov, mean_cov 196 return min_cov, max_cov, mean_cov, bases
197
139 198
140 # Parse and combine the output 199 # Parse and combine the output
141 out_handle = open(tabular_filename, "w") 200 out_handle = open(tabular_filename, "w")
142 out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n") 201 out_handle.write("#identifer\tlength\tmapped\tplaced\tmin_cov\tmax_cov\tmean_cov\n")
143 202
146 205
147 depth_ref = None 206 depth_ref = None
148 depth_pos = 0 207 depth_pos = 0
149 depth_reads = 0 208 depth_reads = 0
150 209
210 global_bases = 0
211 global_length = 0
151 for line in idxstats_handle: 212 for line in idxstats_handle:
152 identifier, length, mapped, placed = line.rstrip("\n").split() 213 identifier, length, mapped, placed = line.rstrip("\n").split()
153 length = int(length) 214 length = int(length)
154 mapped = int(mapped) 215 mapped = int(mapped)
155 placed = int(placed) 216 placed = int(placed)
156 if identifier == "*": 217 if identifier == "*":
157 min_cov = 0 218 min_cov = 0
158 max_cov = 0 219 max_cov = 0
159 mean_cov = 0.0 220 mean_cov = 0.0
221 bases = 0
160 else: 222 else:
161 min_cov, max_cov, mean_cov = load_total_coverage(depth_handle, identifier, length) 223 min_cov, max_cov, mean_cov, bases = load_total_coverage(depth_handle, identifier, length)
224 if max_cov > max_depth:
225 sys.exit("Using max depth %i yet saw max coverage %i for %s"
226 % (max_depth, max_cov, identifier))
162 out_handle.write("%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n" 227 out_handle.write("%s\t%i\t%i\t%i\t%i\t%i\t%0.2f\n"
163 % (identifier, length, mapped, placed, 228 % (identifier, length, mapped, placed,
164 min_cov, max_cov, mean_cov)) 229 min_cov, max_cov, mean_cov))
165 if not (min_cov <= mean_cov <= max_cov): 230 if not (min_cov <= mean_cov <= max_cov):
166 out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n") 231 out_handle.write("ERROR, expect min_cov <= mean_cov <= max_cov\n")
167 idxstats_handle.close() 232 idxstats_handle.close()
168 depth_handle.close() 233 depth_handle.close()
169 out_handle.close() 234 out_handle.close()
170 clean_up() 235 clean_up()
171 stop_err("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov" 236 sys.exit("Problem with coverage for %s, expect min_cov <= mean_cov <= max_cov"
172 % identifier) 237 % identifier)
238 global_length += length
239 global_bases += bases
173 240
174 idxstats_handle.close() 241 idxstats_handle.close()
175 depth_handle.close() 242 depth_handle.close()
176 out_handle.close() 243 out_handle.close()
177 244
245 print("Total reference length %i with overall mean coverage %0.2f" % (global_length, float(global_bases) / global_length))
246
178 # Remove the temp symlinks and files: 247 # Remove the temp symlinks and files:
179 clean_up() 248 clean_up()
180 249
181 if depth_ref is not None: 250 if depth_ref is not None:
182 stop_err("Left over output from 'samtools depth'? %r" % depth_ref) 251 sys.exit("Left over output from 'samtools depth'? %r" % depth_ref)