Mercurial > repos > peterjc > count_roi_variants
annotate tools/count_roi_variants/count_roi_variants.py @ 2:167765a633c0 draft default tip
"Update all the pico_galaxy tools on main Tool Shed"
author | peterjc |
---|---|
date | Fri, 16 Apr 2021 22:34:00 +0000 |
parents | 3ee6f4d0ac80 |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 """Count sequence variants in region of interest in a BAM file. | |
3 | |
4 This script takes exactly four command line arguments: | |
5 * Input BAM filename | |
6 * Input BAI filename (via Galaxy metadata) | |
7 * Output tabular filename | |
8 * Region of interest (reference:start-end as used in samtools) | |
9 | |
10 This messes about with the filenames to make samtools happy, then | |
11 runs "samtools view" and parses the reads mapped to the ROI, counts | |
12 the observed variants spanning the ROI, and outputs this as a | |
13 tabular file. | |
14 """ | |
1
3ee6f4d0ac80
v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents:
0
diff
changeset
|
15 |
0 | 16 import os |
17 import subprocess | |
1
3ee6f4d0ac80
v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents:
0
diff
changeset
|
18 import sys |
0 | 19 import tempfile |
20 | |
21 if "-v" in sys.argv or "--version" in sys.argv: | |
22 # Galaxy seems to invert the order of the two lines | |
2 | 23 print("BAM coverage statistics v0.0.6 (using samtools)") |
0 | 24 cmd = "samtools 2>&1 | grep -i ^Version" |
25 sys.exit(os.system(cmd)) | |
26 | |
27 # TODO - Proper command line API | |
28 usage = """Requires 4 arguments: BAM, BAI, tabular filename, samtools-style region | |
29 | |
30 For ease of use, you can use a minus sign as the BAI filename which will use the BAM | |
31 filename with the suffix .bai added to it. Example using one of the test-data files: | |
32 | |
33 $ count_roi_variants.py "SRR639755_mito_pairs_vs_NC_010642_clc.bam" "-" "counts.txt" "gi|187250362|ref|NC_010642.1|:1695-1725" | |
34 Counted 3 variants from 16 reads spanning gi|187250362|ref|NC_010642.1|:1695-1725 | |
35 $ cat counts.txt | |
36 Variant Count Percentage | |
37 AGCCCATGAGATGGGAAGCAATGGGCTACA 14 87.50 | |
38 AGCCCATGAGATGGGAAGCAATGGGCTACG 1 6.25 | |
39 AGCGCATGAGATGGGAAGCAATGGGCTACG 1 6.25 | |
2 | 40 """ # noqa: E501 |
41 | |
0 | 42 if len(sys.argv) == 5: |
43 bam_filename, bai_filename, tabular_filename, region = sys.argv[1:] | |
44 else: | |
45 sys.exit(usage) | |
46 | |
47 if not os.path.isfile(bam_filename): | |
48 sys.exit("Input BAM file not found: %s" % bam_filename) | |
49 if bai_filename == "-": | |
50 # Make this optional for ease of use at the command line by hand: | |
51 if os.path.isfile(bam_filename + ".bai"): | |
52 bai_filename = bam_filename + ".bai" | |
53 if not os.path.isfile(bai_filename): | |
54 if bai_filename == "None": | |
55 sys.exit("Error: Galaxy did not index your BAM file") | |
56 sys.exit("Input BAI file not found: %s" % bai_filename) | |
57 | |
58 try: | |
59 # Sanity check we have "ref:start-end" to give clear error message | |
60 # Note can have semi-colon in the reference name | |
61 # Note can have thousand separator commas in the start/end | |
62 ref, start_end = region.rsplit(":", 1) | |
63 start, end = start_end.split("-") | |
64 start = int(start.replace(",", "")) | |
65 end = int(end.replace(",", "")) | |
66 except ValueError: | |
67 sys.exit("Bad argument for region: %r" % region) | |
68 | |
69 | |
70 # Assign sensible names with real extensions, and setup symlinks: | |
71 tmp_dir = tempfile.mkdtemp() | |
72 bam_file = os.path.join(tmp_dir, "temp.bam") | |
73 bai_file = os.path.join(tmp_dir, "temp.bam.bai") | |
74 os.symlink(os.path.abspath(bam_filename), bam_file) | |
75 os.symlink(os.path.abspath(bai_filename), bai_file) | |
76 assert os.path.isfile(bam_file), bam_file | |
77 assert os.path.isfile(bai_file), bai_file | |
78 assert os.path.isfile(bam_file + ".bai"), bam_file | |
79 | |
80 | |
81 def clean_up(): | |
82 os.remove(bam_file) | |
83 os.remove(bai_file) | |
84 os.rmdir(tmp_dir) | |
85 | |
86 | |
87 def decode_cigar(cigar): | |
2 | 88 """Return a list of 2-tuples, integer count and operator char.""" |
0 | 89 count = "" |
90 answer = [] | |
91 for letter in cigar: | |
92 if letter.isdigit(): | |
93 count += letter # string addition | |
94 elif letter in "MIDNSHP=X": | |
95 answer.append((int(count), letter)) | |
96 count = "" | |
97 else: | |
98 raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar)) | |
99 return answer | |
100 | |
101 | |
2 | 102 assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [ |
103 (14, "S"), | |
104 (15, "M"), | |
105 (1, "P"), | |
106 (1, "D"), | |
107 (3, "P"), | |
108 (54, "M"), | |
109 (1, "D"), | |
110 (34, "M"), | |
111 (5, "S"), | |
112 ] | |
0 | 113 |
114 | |
115 def align_len(cigar_ops): | |
2 | 116 """Sum the CIGAR M/=/X/D/N operators.""" |
0 | 117 return sum(count for count, op in cigar_ops if op in "M=XDN") |
118 | |
119 | |
120 def expand_cigar(seq, cigar_ops): | |
2 | 121 """Yield (ref_offset, seq_base) pairs.""" |
0 | 122 ref_offset = 0 |
123 seq_offset = 0 | |
124 for count, op in cigar_ops: | |
125 if op in "MX=": | |
2 | 126 for (i, base) in enumerate(seq[seq_offset : seq_offset + count]): |
0 | 127 yield ref_offset + i, base |
128 ref_offset += count | |
129 seq_offset += count | |
130 elif op == "I": | |
131 # Give them all an in-between reference position | |
132 # (Python lets us mix integers and floats, wouldn't work in C) | |
2 | 133 for (i, base) in enumerate(seq[seq_offset : seq_offset + count]): |
0 | 134 yield ref_offset - 0.5, base |
135 # Does not change ref_offset | |
136 seq_offset += count | |
137 elif op in "DN": | |
138 # Deletion/skip, | |
139 # TODO: Option to return gap characters | |
140 # for i in range(count): | |
141 # yield ref_offset + i, "-" | |
142 ref_offset += count | |
143 elif op == "S": | |
144 # Soft clipping, silently discard the bases (OK?) | |
145 seq_offset += count | |
146 elif op in "HP": | |
147 # Hard trimming or pad, can ignore | |
148 # TODO: Yield "-" if later offer to report deletions | |
149 pass | |
150 else: | |
151 raise NotImplementedError("Unexpected CIGAR operator %s" % op) | |
152 | |
1
3ee6f4d0ac80
v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents:
0
diff
changeset
|
153 |
2 | 154 assert list(expand_cigar("ACGT", decode_cigar("4M"))) == [ |
155 (0, "A"), | |
156 (1, "C"), | |
157 (2, "G"), | |
158 (3, "T"), | |
159 ] | |
160 assert list(expand_cigar("ACGT", decode_cigar("2=1X1="))) == [ | |
161 (0, "A"), | |
162 (1, "C"), | |
163 (2, "G"), | |
164 (3, "T"), | |
165 ] | |
166 assert list(expand_cigar("ACGT", decode_cigar("2M1D2M"))) == [ | |
167 (0, "A"), | |
168 (1, "C"), | |
169 (3, "G"), | |
170 (4, "T"), | |
171 ] | |
172 assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [ | |
173 (0, "A"), | |
174 (1, "C"), | |
175 (1.5, "t"), | |
176 (2, "G"), | |
177 (3, "T"), | |
178 ] | |
179 assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [ | |
180 (-0.5, "t"), | |
181 (0, "A"), | |
182 (1, "C"), | |
183 (2, "G"), | |
184 (3, "T"), | |
185 ] | |
186 assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [ | |
187 (0, "A"), | |
188 (1, "C"), | |
189 (2, "G"), | |
190 (3, "T"), | |
191 (3.5, "t"), | |
192 ] | |
193 assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [ | |
194 (0, "A"), | |
195 (1, "A"), | |
196 (2, "A"), | |
197 (3, "A"), | |
198 (4, "G"), | |
199 (5, "G"), | |
200 (6, "G"), | |
201 (7, "G"), | |
202 (8, "T"), | |
203 (9, "T"), | |
204 (10, "T"), | |
205 (11, "T"), | |
206 ] | |
207 assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [ | |
208 (0, "A"), | |
209 (1, "A"), | |
210 (2, "A"), | |
211 (3, "A"), | |
212 (3.5, "c"), | |
213 (4, "G"), | |
214 (5, "G"), | |
215 (6, "G"), | |
216 (7, "G"), | |
217 (8, "T"), | |
218 (9, "T"), | |
219 (10, "T"), | |
220 (11, "T"), | |
221 ] | |
222 assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [ | |
223 (0, "A"), | |
224 (1, "A"), | |
225 (2, "A"), | |
226 (3, "A"), | |
227 (4, "G"), | |
228 (5, "G"), | |
229 (6, "G"), | |
230 (7, "G"), | |
231 (7.5, "c"), | |
232 (8, "T"), | |
233 (9, "T"), | |
234 (10, "T"), | |
235 (11, "T"), | |
236 ] | |
237 assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [ | |
238 (0, "A"), | |
239 (1, "A"), | |
240 (2, "A"), | |
241 (3, "A"), | |
242 (3.5, "c"), | |
243 (4, "G"), | |
244 (5, "G"), | |
245 (6, "G"), | |
246 (7, "G"), | |
247 (7.5, "c"), | |
248 (8, "T"), | |
249 (9, "T"), | |
250 (10, "T"), | |
251 (11, "T"), | |
252 ] | |
0 | 253 |
254 | |
255 def get_roi(seq, cigar_ops, start, end): | |
256 """Extract region of seq mapping to the ROI. | |
257 | |
258 Expect start and end to be zero based Python style end points. | |
259 | |
260 i.e. The ROI relative to the mapping start recorded in the POS field. | |
261 Will return part of the SAM/BAM value SEQ based on interpretting the | |
262 passed CIGAR operators. | |
263 """ | |
264 if len(cigar_ops) == 1 and cigar_ops[0][1] in "M=X": | |
265 # Easy case, note start/end/pos all one-based | |
266 assert cigar_ops[0][0] == len(seq) | |
267 return seq[start:end] | |
268 # Would use "start <= i < end" if they were all integers, but | |
269 # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8. | |
2 | 270 return "".join( |
271 base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1 | |
272 ) | |
0 | 273 |
1
3ee6f4d0ac80
v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents:
0
diff
changeset
|
274 |
0 | 275 assert "GGGG" == get_roi("AAAAGGGGTTTT", decode_cigar("12M"), 4, 8) |
276 assert "GGGG" == get_roi("AAAAcGGGGTTTT", decode_cigar("4M1I8M"), 4, 8) | |
277 assert "GGGG" == get_roi("AAAAGGGGcTTTT", decode_cigar("8M1I4M"), 4, 8) | |
278 assert "GGGG" == get_roi("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"), 4, 8) | |
279 assert "GGaGG" == get_roi("AAAAGGaGGTTTT", decode_cigar("6M1I6M"), 4, 8) | |
280 assert "GGGgA" == get_roi("AAAAGGGgATTTT", decode_cigar("7M1I5M"), 4, 8) | |
281 | |
282 | |
283 def count_region(): | |
1
3ee6f4d0ac80
v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents:
0
diff
changeset
|
284 """Count reads mapped to the region. |
3ee6f4d0ac80
v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents:
0
diff
changeset
|
285 |
3ee6f4d0ac80
v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents:
0
diff
changeset
|
286 Calls samtools view and parses the output. |
3ee6f4d0ac80
v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents:
0
diff
changeset
|
287 """ |
0 | 288 # Could recreate the region string (with no commas in start/end)? |
289 # region = "%s:%i-%i" % (ref, start, end) | |
290 | |
2 | 291 tally = {} |
0 | 292 |
293 # Call samtools view, don't need header so no -h added. | |
294 # Only want mapped reads, thus flag filter -F 4. | |
2 | 295 child = subprocess.Popen( |
296 ["samtools", "view", "-F", "4", bam_file, region], | |
297 universal_newlines=True, | |
298 stdout=subprocess.PIPE, | |
299 stderr=subprocess.PIPE, | |
300 ) | |
0 | 301 for line in child.stdout: |
302 assert line[0] != "@", "Got unexpected SAM header line: %s" % line | |
2 | 303 ( |
304 qname, | |
305 flag, | |
306 rname, | |
307 pos, | |
308 mapq, | |
309 cigar, | |
310 rnext, | |
311 pnext, | |
312 tlen, | |
313 seq, | |
314 rest, | |
315 ) = line.split("\t", 10) | |
0 | 316 pos = int(pos) # one-based |
317 if start < pos: | |
318 # Does not span the ROI | |
319 continue | |
320 cigar_ops = decode_cigar(cigar) | |
321 if pos + align_len(cigar_ops) - 1 < end: | |
322 # Does not span the ROI | |
323 continue | |
324 # All of start/end/pos are currently one-based, making offsets Python style.... | |
325 roi_seq = get_roi(seq, cigar_ops, start - pos, end - pos + 1) | |
326 assert roi_seq, "Error, empty ROI sequence for: %s" % line | |
327 try: | |
328 tally[roi_seq] += 1 | |
329 except KeyError: | |
330 tally[roi_seq] = 1 | |
331 | |
332 stderr = child.stderr.read() | |
333 child.stdout.close() | |
334 child.stderr.close() | |
335 return_code = child.wait() | |
336 if return_code: | |
337 sys.exit("Got return code %i from samtools view" % return_code) | |
338 elif "specifies an unknown reference name. Continue anyway." in stderr: | |
2 | 339 sys.exit( |
340 stderr.strip() | |
341 + "\n\nERROR: samtools did not recognise the region requested, " | |
342 "can't count any variants." | |
343 ) | |
0 | 344 |
345 return tally | |
346 | |
347 | |
348 def record_counts(): | |
1
3ee6f4d0ac80
v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents:
0
diff
changeset
|
349 """Record counts to tabular file.""" |
0 | 350 tally = count_region() |
351 total = sum(tally.values()) | |
352 | |
353 # Using negative count to get sort with highest count first, | |
354 # while tie-breaking by the ROI sequence alphabetically. | |
355 table = sorted((-count, roi_seq) for (roi_seq, count) in tally.items()) | |
356 del tally | |
357 | |
358 with open(tabular_filename, "w") as handle: | |
359 handle.write("Variant\tCount\tPercentage\n") | |
360 for count, roi_seq in table: | |
361 handle.write("%s\t%i\t%0.2f\n" % (roi_seq, -count, -count * 100.0 / total)) | |
362 | |
363 print("Counted %i variants from %i reads spanning %s" % (len(table), total, region)) | |
364 | |
365 | |
366 # Run it! | |
367 record_counts() | |
368 # Remove the temp symlinks and files: | |
369 clean_up() |