0
|
1 #!/usr/bin/env python
|
|
2 """Count sequence variants in region of interest in a BAM file.
|
|
3
|
|
4 This script takes exactly four command line arguments:
|
|
5 * Input BAM filename
|
|
6 * Input BAI filename (via Galaxy metadata)
|
|
7 * Output tabular filename
|
|
8 * Region of interest (reference:start-end as used in samtools)
|
|
9
|
|
10 This messes about with the filenames to make samtools happy, then
|
|
11 runs "samtools view" and parses the reads mapped to the ROI, counts
|
|
12 the observed variants spanning the ROI, and outputs this as a
|
|
13 tabular file.
|
|
14 """
|
|
15 import sys
|
|
16 import os
|
|
17 import subprocess
|
|
18 import tempfile
|
|
19
|
|
20 if "-v" in sys.argv or "--version" in sys.argv:
|
|
21 # Galaxy seems to invert the order of the two lines
|
|
22 print("BAM coverage statistics v0.0.4 (using samtools)")
|
|
23 cmd = "samtools 2>&1 | grep -i ^Version"
|
|
24 sys.exit(os.system(cmd))
|
|
25
|
|
26 # TODO - Proper command line API
|
|
27 usage = """Requires 4 arguments: BAM, BAI, tabular filename, samtools-style region
|
|
28
|
|
29 For ease of use, you can use a minus sign as the BAI filename which will use the BAM
|
|
30 filename with the suffix .bai added to it. Example using one of the test-data files:
|
|
31
|
|
32 $ count_roi_variants.py "SRR639755_mito_pairs_vs_NC_010642_clc.bam" "-" "counts.txt" "gi|187250362|ref|NC_010642.1|:1695-1725"
|
|
33 Counted 3 variants from 16 reads spanning gi|187250362|ref|NC_010642.1|:1695-1725
|
|
34 $ cat counts.txt
|
|
35 Variant Count Percentage
|
|
36 AGCCCATGAGATGGGAAGCAATGGGCTACA 14 87.50
|
|
37 AGCCCATGAGATGGGAAGCAATGGGCTACG 1 6.25
|
|
38 AGCGCATGAGATGGGAAGCAATGGGCTACG 1 6.25
|
|
39 """
|
|
40 if len(sys.argv) == 5:
|
|
41 bam_filename, bai_filename, tabular_filename, region = sys.argv[1:]
|
|
42 else:
|
|
43 sys.exit(usage)
|
|
44
|
|
45 if not os.path.isfile(bam_filename):
|
|
46 sys.exit("Input BAM file not found: %s" % bam_filename)
|
|
47 if bai_filename == "-":
|
|
48 # Make this optional for ease of use at the command line by hand:
|
|
49 if os.path.isfile(bam_filename + ".bai"):
|
|
50 bai_filename = bam_filename + ".bai"
|
|
51 if not os.path.isfile(bai_filename):
|
|
52 if bai_filename == "None":
|
|
53 sys.exit("Error: Galaxy did not index your BAM file")
|
|
54 sys.exit("Input BAI file not found: %s" % bai_filename)
|
|
55
|
|
56 try:
|
|
57 # Sanity check we have "ref:start-end" to give clear error message
|
|
58 # Note can have semi-colon in the reference name
|
|
59 # Note can have thousand separator commas in the start/end
|
|
60 ref, start_end = region.rsplit(":", 1)
|
|
61 start, end = start_end.split("-")
|
|
62 start = int(start.replace(",", ""))
|
|
63 end = int(end.replace(",", ""))
|
|
64 except ValueError:
|
|
65 sys.exit("Bad argument for region: %r" % region)
|
|
66
|
|
67
|
|
68 # Assign sensible names with real extensions, and setup symlinks:
|
|
69 tmp_dir = tempfile.mkdtemp()
|
|
70 bam_file = os.path.join(tmp_dir, "temp.bam")
|
|
71 bai_file = os.path.join(tmp_dir, "temp.bam.bai")
|
|
72 os.symlink(os.path.abspath(bam_filename), bam_file)
|
|
73 os.symlink(os.path.abspath(bai_filename), bai_file)
|
|
74 assert os.path.isfile(bam_file), bam_file
|
|
75 assert os.path.isfile(bai_file), bai_file
|
|
76 assert os.path.isfile(bam_file + ".bai"), bam_file
|
|
77
|
|
78
|
|
79 def clean_up():
|
|
80 os.remove(bam_file)
|
|
81 os.remove(bai_file)
|
|
82 os.rmdir(tmp_dir)
|
|
83
|
|
84
|
|
85 def decode_cigar(cigar):
|
|
86 """Returns a list of 2-tuples, integer count and operator char."""
|
|
87 count = ""
|
|
88 answer = []
|
|
89 for letter in cigar:
|
|
90 if letter.isdigit():
|
|
91 count += letter # string addition
|
|
92 elif letter in "MIDNSHP=X":
|
|
93 answer.append((int(count), letter))
|
|
94 count = ""
|
|
95 else:
|
|
96 raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar))
|
|
97 return answer
|
|
98
|
|
99
|
|
100 assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14, 'S'), (15, 'M'), (1, 'P'), (1, 'D'), (3, 'P'), (54, 'M'), (1, 'D'), (34, 'M'), (5, 'S')]
|
|
101
|
|
102
|
|
103 def align_len(cigar_ops):
|
|
104 """Sums the CIGAR M/=/X/D/N operators."""
|
|
105 return sum(count for count, op in cigar_ops if op in "M=XDN")
|
|
106
|
|
107
|
|
108 def expand_cigar(seq, cigar_ops):
|
|
109 """Yields (ref_offset, seq_base) pairs."""
|
|
110 ref_offset = 0
|
|
111 seq_offset = 0
|
|
112 for count, op in cigar_ops:
|
|
113 if op in "MX=":
|
|
114 for (i, base) in enumerate(seq[seq_offset:seq_offset + count]):
|
|
115 yield ref_offset + i, base
|
|
116 ref_offset += count
|
|
117 seq_offset += count
|
|
118 elif op == "I":
|
|
119 # Give them all an in-between reference position
|
|
120 # (Python lets us mix integers and floats, wouldn't work in C)
|
|
121 for (i, base) in enumerate(seq[seq_offset:seq_offset + count]):
|
|
122 yield ref_offset - 0.5, base
|
|
123 # Does not change ref_offset
|
|
124 seq_offset += count
|
|
125 elif op in "DN":
|
|
126 # Deletion/skip,
|
|
127 # TODO: Option to return gap characters
|
|
128 # for i in range(count):
|
|
129 # yield ref_offset + i, "-"
|
|
130 ref_offset += count
|
|
131 elif op == "S":
|
|
132 # Soft clipping, silently discard the bases (OK?)
|
|
133 seq_offset += count
|
|
134 elif op in "HP":
|
|
135 # Hard trimming or pad, can ignore
|
|
136 # TODO: Yield "-" if later offer to report deletions
|
|
137 pass
|
|
138 else:
|
|
139 raise NotImplementedError("Unexpected CIGAR operator %s" % op)
|
|
140
|
|
141 assert list(expand_cigar("ACGT", decode_cigar("4M"))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")]
|
|
142 assert list(expand_cigar("ACGT", decode_cigar("2=1X1="))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")]
|
|
143 assert list(expand_cigar("ACGT", decode_cigar("2M1D2M"))) == [(0, "A"), (1, "C"), (3, "G"), (4, "T")]
|
|
144 assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [(0, "A"), (1, "C"), (1.5, "t"), (2, "G"), (3, "T")]
|
|
145 assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [(-0.5, 't'), (0, 'A'), (1, 'C'), (2, 'G'), (3, 'T')]
|
|
146 assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [(0, 'A'), (1, 'C'), (2, 'G'), (3, 'T'), (3.5, 't')]
|
|
147 assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
|
|
148 assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
|
|
149 assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, "c"), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
|
|
150 assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, 'c'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
|
|
151
|
|
152
|
|
153 def get_roi(seq, cigar_ops, start, end):
|
|
154 """Extract region of seq mapping to the ROI.
|
|
155
|
|
156 Expect start and end to be zero based Python style end points.
|
|
157
|
|
158 i.e. The ROI relative to the mapping start recorded in the POS field.
|
|
159 Will return part of the SAM/BAM value SEQ based on interpretting the
|
|
160 passed CIGAR operators.
|
|
161 """
|
|
162 if len(cigar_ops) == 1 and cigar_ops[0][1] in "M=X":
|
|
163 # Easy case, note start/end/pos all one-based
|
|
164 assert cigar_ops[0][0] == len(seq)
|
|
165 return seq[start:end]
|
|
166 # Would use "start <= i < end" if they were all integers, but
|
|
167 # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8.
|
|
168 return "".join(base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1)
|
|
169
|
|
170 assert "GGGG" == get_roi("AAAAGGGGTTTT", decode_cigar("12M"), 4, 8)
|
|
171 assert "GGGG" == get_roi("AAAAcGGGGTTTT", decode_cigar("4M1I8M"), 4, 8)
|
|
172 assert "GGGG" == get_roi("AAAAGGGGcTTTT", decode_cigar("8M1I4M"), 4, 8)
|
|
173 assert "GGGG" == get_roi("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"), 4, 8)
|
|
174 assert "GGaGG" == get_roi("AAAAGGaGGTTTT", decode_cigar("6M1I6M"), 4, 8)
|
|
175 assert "GGGgA" == get_roi("AAAAGGGgATTTT", decode_cigar("7M1I5M"), 4, 8)
|
|
176
|
|
177
|
|
178 def count_region():
|
|
179 # Could recreate the region string (with no commas in start/end)?
|
|
180 # region = "%s:%i-%i" % (ref, start, end)
|
|
181
|
|
182 tally = dict()
|
|
183
|
|
184 # Call samtools view, don't need header so no -h added.
|
|
185 # Only want mapped reads, thus flag filter -F 4.
|
|
186 child = subprocess.Popen(["samtools", "view", "-F", "4", bam_file, region],
|
|
187 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
188 for line in child.stdout:
|
|
189 assert line[0] != "@", "Got unexpected SAM header line: %s" % line
|
|
190 qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, rest = line.split("\t", 10)
|
|
191 pos = int(pos) # one-based
|
|
192 if start < pos:
|
|
193 # Does not span the ROI
|
|
194 continue
|
|
195 cigar_ops = decode_cigar(cigar)
|
|
196 if pos + align_len(cigar_ops) - 1 < end:
|
|
197 # Does not span the ROI
|
|
198 continue
|
|
199 # All of start/end/pos are currently one-based, making offsets Python style....
|
|
200 roi_seq = get_roi(seq, cigar_ops, start - pos, end - pos + 1)
|
|
201 assert roi_seq, "Error, empty ROI sequence for: %s" % line
|
|
202 try:
|
|
203 tally[roi_seq] += 1
|
|
204 except KeyError:
|
|
205 tally[roi_seq] = 1
|
|
206
|
|
207 stderr = child.stderr.read()
|
|
208 child.stdout.close()
|
|
209 child.stderr.close()
|
|
210 return_code = child.wait()
|
|
211 if return_code:
|
|
212 sys.exit("Got return code %i from samtools view" % return_code)
|
|
213 elif "specifies an unknown reference name. Continue anyway." in stderr:
|
|
214 sys.exit(stderr.strip() + "\n\nERROR: samtools did not recognise the region requested, can't count any variants.")
|
|
215
|
|
216 return tally
|
|
217
|
|
218
|
|
219 def record_counts():
|
|
220
|
|
221 tally = count_region()
|
|
222 total = sum(tally.values())
|
|
223
|
|
224 # Using negative count to get sort with highest count first,
|
|
225 # while tie-breaking by the ROI sequence alphabetically.
|
|
226 table = sorted((-count, roi_seq) for (roi_seq, count) in tally.items())
|
|
227 del tally
|
|
228
|
|
229 with open(tabular_filename, "w") as handle:
|
|
230 handle.write("Variant\tCount\tPercentage\n")
|
|
231 for count, roi_seq in table:
|
|
232 handle.write("%s\t%i\t%0.2f\n" % (roi_seq, -count, -count * 100.0 / total))
|
|
233
|
|
234 print("Counted %i variants from %i reads spanning %s" % (len(table), total, region))
|
|
235
|
|
236
|
|
237 # Run it!
|
|
238 record_counts()
|
|
239 # Remove the temp symlinks and files:
|
|
240 clean_up()
|