comparison tools/count_roi_variants/count_roi_variants.py @ 0:95efbdb72961 draft

v0.0.4 - Previously only on Test Tool Shed
author peterjc
date Wed, 01 Feb 2017 07:10:26 -0500
parents
children 3ee6f4d0ac80
comparison
equal deleted inserted replaced
-1:000000000000 0:95efbdb72961
1 #!/usr/bin/env python
2 """Count sequence variants in region of interest in a BAM file.
3
4 This script takes exactly four command line arguments:
5 * Input BAM filename
6 * Input BAI filename (via Galaxy metadata)
7 * Output tabular filename
8 * Region of interest (reference:start-end as used in samtools)
9
10 This messes about with the filenames to make samtools happy, then
11 runs "samtools view" and parses the reads mapped to the ROI, counts
12 the observed variants spanning the ROI, and outputs this as a
13 tabular file.
14 """
15 import sys
16 import os
17 import subprocess
18 import tempfile
19
20 if "-v" in sys.argv or "--version" in sys.argv:
21 # Galaxy seems to invert the order of the two lines
22 print("BAM coverage statistics v0.0.4 (using samtools)")
23 cmd = "samtools 2>&1 | grep -i ^Version"
24 sys.exit(os.system(cmd))
25
26 # TODO - Proper command line API
27 usage = """Requires 4 arguments: BAM, BAI, tabular filename, samtools-style region
28
29 For ease of use, you can use a minus sign as the BAI filename which will use the BAM
30 filename with the suffix .bai added to it. Example using one of the test-data files:
31
32 $ count_roi_variants.py "SRR639755_mito_pairs_vs_NC_010642_clc.bam" "-" "counts.txt" "gi|187250362|ref|NC_010642.1|:1695-1725"
33 Counted 3 variants from 16 reads spanning gi|187250362|ref|NC_010642.1|:1695-1725
34 $ cat counts.txt
35 Variant Count Percentage
36 AGCCCATGAGATGGGAAGCAATGGGCTACA 14 87.50
37 AGCCCATGAGATGGGAAGCAATGGGCTACG 1 6.25
38 AGCGCATGAGATGGGAAGCAATGGGCTACG 1 6.25
39 """
40 if len(sys.argv) == 5:
41 bam_filename, bai_filename, tabular_filename, region = sys.argv[1:]
42 else:
43 sys.exit(usage)
44
45 if not os.path.isfile(bam_filename):
46 sys.exit("Input BAM file not found: %s" % bam_filename)
47 if bai_filename == "-":
48 # Make this optional for ease of use at the command line by hand:
49 if os.path.isfile(bam_filename + ".bai"):
50 bai_filename = bam_filename + ".bai"
51 if not os.path.isfile(bai_filename):
52 if bai_filename == "None":
53 sys.exit("Error: Galaxy did not index your BAM file")
54 sys.exit("Input BAI file not found: %s" % bai_filename)
55
56 try:
57 # Sanity check we have "ref:start-end" to give clear error message
58 # Note can have semi-colon in the reference name
59 # Note can have thousand separator commas in the start/end
60 ref, start_end = region.rsplit(":", 1)
61 start, end = start_end.split("-")
62 start = int(start.replace(",", ""))
63 end = int(end.replace(",", ""))
64 except ValueError:
65 sys.exit("Bad argument for region: %r" % region)
66
67
68 # Assign sensible names with real extensions, and setup symlinks:
69 tmp_dir = tempfile.mkdtemp()
70 bam_file = os.path.join(tmp_dir, "temp.bam")
71 bai_file = os.path.join(tmp_dir, "temp.bam.bai")
72 os.symlink(os.path.abspath(bam_filename), bam_file)
73 os.symlink(os.path.abspath(bai_filename), bai_file)
74 assert os.path.isfile(bam_file), bam_file
75 assert os.path.isfile(bai_file), bai_file
76 assert os.path.isfile(bam_file + ".bai"), bam_file
77
78
79 def clean_up():
80 os.remove(bam_file)
81 os.remove(bai_file)
82 os.rmdir(tmp_dir)
83
84
85 def decode_cigar(cigar):
86 """Returns a list of 2-tuples, integer count and operator char."""
87 count = ""
88 answer = []
89 for letter in cigar:
90 if letter.isdigit():
91 count += letter # string addition
92 elif letter in "MIDNSHP=X":
93 answer.append((int(count), letter))
94 count = ""
95 else:
96 raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar))
97 return answer
98
99
100 assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14, 'S'), (15, 'M'), (1, 'P'), (1, 'D'), (3, 'P'), (54, 'M'), (1, 'D'), (34, 'M'), (5, 'S')]
101
102
103 def align_len(cigar_ops):
104 """Sums the CIGAR M/=/X/D/N operators."""
105 return sum(count for count, op in cigar_ops if op in "M=XDN")
106
107
108 def expand_cigar(seq, cigar_ops):
109 """Yields (ref_offset, seq_base) pairs."""
110 ref_offset = 0
111 seq_offset = 0
112 for count, op in cigar_ops:
113 if op in "MX=":
114 for (i, base) in enumerate(seq[seq_offset:seq_offset + count]):
115 yield ref_offset + i, base
116 ref_offset += count
117 seq_offset += count
118 elif op == "I":
119 # Give them all an in-between reference position
120 # (Python lets us mix integers and floats, wouldn't work in C)
121 for (i, base) in enumerate(seq[seq_offset:seq_offset + count]):
122 yield ref_offset - 0.5, base
123 # Does not change ref_offset
124 seq_offset += count
125 elif op in "DN":
126 # Deletion/skip,
127 # TODO: Option to return gap characters
128 # for i in range(count):
129 # yield ref_offset + i, "-"
130 ref_offset += count
131 elif op == "S":
132 # Soft clipping, silently discard the bases (OK?)
133 seq_offset += count
134 elif op in "HP":
135 # Hard trimming or pad, can ignore
136 # TODO: Yield "-" if later offer to report deletions
137 pass
138 else:
139 raise NotImplementedError("Unexpected CIGAR operator %s" % op)
140
141 assert list(expand_cigar("ACGT", decode_cigar("4M"))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")]
142 assert list(expand_cigar("ACGT", decode_cigar("2=1X1="))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")]
143 assert list(expand_cigar("ACGT", decode_cigar("2M1D2M"))) == [(0, "A"), (1, "C"), (3, "G"), (4, "T")]
144 assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [(0, "A"), (1, "C"), (1.5, "t"), (2, "G"), (3, "T")]
145 assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [(-0.5, 't'), (0, 'A'), (1, 'C'), (2, 'G'), (3, 'T')]
146 assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [(0, 'A'), (1, 'C'), (2, 'G'), (3, 'T'), (3.5, 't')]
147 assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
148 assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
149 assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, "c"), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
150 assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, 'c'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
151
152
153 def get_roi(seq, cigar_ops, start, end):
154 """Extract region of seq mapping to the ROI.
155
156 Expect start and end to be zero based Python style end points.
157
158 i.e. The ROI relative to the mapping start recorded in the POS field.
159 Will return part of the SAM/BAM value SEQ based on interpretting the
160 passed CIGAR operators.
161 """
162 if len(cigar_ops) == 1 and cigar_ops[0][1] in "M=X":
163 # Easy case, note start/end/pos all one-based
164 assert cigar_ops[0][0] == len(seq)
165 return seq[start:end]
166 # Would use "start <= i < end" if they were all integers, but
167 # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8.
168 return "".join(base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1)
169
170 assert "GGGG" == get_roi("AAAAGGGGTTTT", decode_cigar("12M"), 4, 8)
171 assert "GGGG" == get_roi("AAAAcGGGGTTTT", decode_cigar("4M1I8M"), 4, 8)
172 assert "GGGG" == get_roi("AAAAGGGGcTTTT", decode_cigar("8M1I4M"), 4, 8)
173 assert "GGGG" == get_roi("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"), 4, 8)
174 assert "GGaGG" == get_roi("AAAAGGaGGTTTT", decode_cigar("6M1I6M"), 4, 8)
175 assert "GGGgA" == get_roi("AAAAGGGgATTTT", decode_cigar("7M1I5M"), 4, 8)
176
177
178 def count_region():
179 # Could recreate the region string (with no commas in start/end)?
180 # region = "%s:%i-%i" % (ref, start, end)
181
182 tally = dict()
183
184 # Call samtools view, don't need header so no -h added.
185 # Only want mapped reads, thus flag filter -F 4.
186 child = subprocess.Popen(["samtools", "view", "-F", "4", bam_file, region],
187 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
188 for line in child.stdout:
189 assert line[0] != "@", "Got unexpected SAM header line: %s" % line
190 qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, rest = line.split("\t", 10)
191 pos = int(pos) # one-based
192 if start < pos:
193 # Does not span the ROI
194 continue
195 cigar_ops = decode_cigar(cigar)
196 if pos + align_len(cigar_ops) - 1 < end:
197 # Does not span the ROI
198 continue
199 # All of start/end/pos are currently one-based, making offsets Python style....
200 roi_seq = get_roi(seq, cigar_ops, start - pos, end - pos + 1)
201 assert roi_seq, "Error, empty ROI sequence for: %s" % line
202 try:
203 tally[roi_seq] += 1
204 except KeyError:
205 tally[roi_seq] = 1
206
207 stderr = child.stderr.read()
208 child.stdout.close()
209 child.stderr.close()
210 return_code = child.wait()
211 if return_code:
212 sys.exit("Got return code %i from samtools view" % return_code)
213 elif "specifies an unknown reference name. Continue anyway." in stderr:
214 sys.exit(stderr.strip() + "\n\nERROR: samtools did not recognise the region requested, can't count any variants.")
215
216 return tally
217
218
219 def record_counts():
220
221 tally = count_region()
222 total = sum(tally.values())
223
224 # Using negative count to get sort with highest count first,
225 # while tie-breaking by the ROI sequence alphabetically.
226 table = sorted((-count, roi_seq) for (roi_seq, count) in tally.items())
227 del tally
228
229 with open(tabular_filename, "w") as handle:
230 handle.write("Variant\tCount\tPercentage\n")
231 for count, roi_seq in table:
232 handle.write("%s\t%i\t%0.2f\n" % (roi_seq, -count, -count * 100.0 / total))
233
234 print("Counted %i variants from %i reads spanning %s" % (len(table), total, region))
235
236
237 # Run it!
238 record_counts()
239 # Remove the temp symlinks and files:
240 clean_up()