Mercurial > repos > peterjc > count_roi_variants
comparison tools/count_roi_variants/count_roi_variants.py @ 0:95efbdb72961 draft
v0.0.4 - Previously only on Test Tool Shed
author | peterjc |
---|---|
date | Wed, 01 Feb 2017 07:10:26 -0500 |
parents | |
children | 3ee6f4d0ac80 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:95efbdb72961 |
---|---|
1 #!/usr/bin/env python | |
2 """Count sequence variants in region of interest in a BAM file. | |
3 | |
4 This script takes exactly four command line arguments: | |
5 * Input BAM filename | |
6 * Input BAI filename (via Galaxy metadata) | |
7 * Output tabular filename | |
8 * Region of interest (reference:start-end as used in samtools) | |
9 | |
10 This messes about with the filenames to make samtools happy, then | |
11 runs "samtools view" and parses the reads mapped to the ROI, counts | |
12 the observed variants spanning the ROI, and outputs this as a | |
13 tabular file. | |
14 """ | |
15 import sys | |
16 import os | |
17 import subprocess | |
18 import tempfile | |
19 | |
20 if "-v" in sys.argv or "--version" in sys.argv: | |
21 # Galaxy seems to invert the order of the two lines | |
22 print("BAM coverage statistics v0.0.4 (using samtools)") | |
23 cmd = "samtools 2>&1 | grep -i ^Version" | |
24 sys.exit(os.system(cmd)) | |
25 | |
26 # TODO - Proper command line API | |
27 usage = """Requires 4 arguments: BAM, BAI, tabular filename, samtools-style region | |
28 | |
29 For ease of use, you can use a minus sign as the BAI filename which will use the BAM | |
30 filename with the suffix .bai added to it. Example using one of the test-data files: | |
31 | |
32 $ count_roi_variants.py "SRR639755_mito_pairs_vs_NC_010642_clc.bam" "-" "counts.txt" "gi|187250362|ref|NC_010642.1|:1695-1725" | |
33 Counted 3 variants from 16 reads spanning gi|187250362|ref|NC_010642.1|:1695-1725 | |
34 $ cat counts.txt | |
35 Variant Count Percentage | |
36 AGCCCATGAGATGGGAAGCAATGGGCTACA 14 87.50 | |
37 AGCCCATGAGATGGGAAGCAATGGGCTACG 1 6.25 | |
38 AGCGCATGAGATGGGAAGCAATGGGCTACG 1 6.25 | |
39 """ | |
40 if len(sys.argv) == 5: | |
41 bam_filename, bai_filename, tabular_filename, region = sys.argv[1:] | |
42 else: | |
43 sys.exit(usage) | |
44 | |
45 if not os.path.isfile(bam_filename): | |
46 sys.exit("Input BAM file not found: %s" % bam_filename) | |
47 if bai_filename == "-": | |
48 # Make this optional for ease of use at the command line by hand: | |
49 if os.path.isfile(bam_filename + ".bai"): | |
50 bai_filename = bam_filename + ".bai" | |
51 if not os.path.isfile(bai_filename): | |
52 if bai_filename == "None": | |
53 sys.exit("Error: Galaxy did not index your BAM file") | |
54 sys.exit("Input BAI file not found: %s" % bai_filename) | |
55 | |
56 try: | |
57 # Sanity check we have "ref:start-end" to give clear error message | |
58 # Note can have semi-colon in the reference name | |
59 # Note can have thousand separator commas in the start/end | |
60 ref, start_end = region.rsplit(":", 1) | |
61 start, end = start_end.split("-") | |
62 start = int(start.replace(",", "")) | |
63 end = int(end.replace(",", "")) | |
64 except ValueError: | |
65 sys.exit("Bad argument for region: %r" % region) | |
66 | |
67 | |
68 # Assign sensible names with real extensions, and setup symlinks: | |
69 tmp_dir = tempfile.mkdtemp() | |
70 bam_file = os.path.join(tmp_dir, "temp.bam") | |
71 bai_file = os.path.join(tmp_dir, "temp.bam.bai") | |
72 os.symlink(os.path.abspath(bam_filename), bam_file) | |
73 os.symlink(os.path.abspath(bai_filename), bai_file) | |
74 assert os.path.isfile(bam_file), bam_file | |
75 assert os.path.isfile(bai_file), bai_file | |
76 assert os.path.isfile(bam_file + ".bai"), bam_file | |
77 | |
78 | |
79 def clean_up(): | |
80 os.remove(bam_file) | |
81 os.remove(bai_file) | |
82 os.rmdir(tmp_dir) | |
83 | |
84 | |
85 def decode_cigar(cigar): | |
86 """Returns a list of 2-tuples, integer count and operator char.""" | |
87 count = "" | |
88 answer = [] | |
89 for letter in cigar: | |
90 if letter.isdigit(): | |
91 count += letter # string addition | |
92 elif letter in "MIDNSHP=X": | |
93 answer.append((int(count), letter)) | |
94 count = "" | |
95 else: | |
96 raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar)) | |
97 return answer | |
98 | |
99 | |
100 assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14, 'S'), (15, 'M'), (1, 'P'), (1, 'D'), (3, 'P'), (54, 'M'), (1, 'D'), (34, 'M'), (5, 'S')] | |
101 | |
102 | |
103 def align_len(cigar_ops): | |
104 """Sums the CIGAR M/=/X/D/N operators.""" | |
105 return sum(count for count, op in cigar_ops if op in "M=XDN") | |
106 | |
107 | |
108 def expand_cigar(seq, cigar_ops): | |
109 """Yields (ref_offset, seq_base) pairs.""" | |
110 ref_offset = 0 | |
111 seq_offset = 0 | |
112 for count, op in cigar_ops: | |
113 if op in "MX=": | |
114 for (i, base) in enumerate(seq[seq_offset:seq_offset + count]): | |
115 yield ref_offset + i, base | |
116 ref_offset += count | |
117 seq_offset += count | |
118 elif op == "I": | |
119 # Give them all an in-between reference position | |
120 # (Python lets us mix integers and floats, wouldn't work in C) | |
121 for (i, base) in enumerate(seq[seq_offset:seq_offset + count]): | |
122 yield ref_offset - 0.5, base | |
123 # Does not change ref_offset | |
124 seq_offset += count | |
125 elif op in "DN": | |
126 # Deletion/skip, | |
127 # TODO: Option to return gap characters | |
128 # for i in range(count): | |
129 # yield ref_offset + i, "-" | |
130 ref_offset += count | |
131 elif op == "S": | |
132 # Soft clipping, silently discard the bases (OK?) | |
133 seq_offset += count | |
134 elif op in "HP": | |
135 # Hard trimming or pad, can ignore | |
136 # TODO: Yield "-" if later offer to report deletions | |
137 pass | |
138 else: | |
139 raise NotImplementedError("Unexpected CIGAR operator %s" % op) | |
140 | |
141 assert list(expand_cigar("ACGT", decode_cigar("4M"))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")] | |
142 assert list(expand_cigar("ACGT", decode_cigar("2=1X1="))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")] | |
143 assert list(expand_cigar("ACGT", decode_cigar("2M1D2M"))) == [(0, "A"), (1, "C"), (3, "G"), (4, "T")] | |
144 assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [(0, "A"), (1, "C"), (1.5, "t"), (2, "G"), (3, "T")] | |
145 assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [(-0.5, 't'), (0, 'A'), (1, 'C'), (2, 'G'), (3, 'T')] | |
146 assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [(0, 'A'), (1, 'C'), (2, 'G'), (3, 'T'), (3.5, 't')] | |
147 assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')] | |
148 assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')] | |
149 assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, "c"), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')] | |
150 assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, 'c'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')] | |
151 | |
152 | |
153 def get_roi(seq, cigar_ops, start, end): | |
154 """Extract region of seq mapping to the ROI. | |
155 | |
156 Expect start and end to be zero based Python style end points. | |
157 | |
158 i.e. The ROI relative to the mapping start recorded in the POS field. | |
159 Will return part of the SAM/BAM value SEQ based on interpretting the | |
160 passed CIGAR operators. | |
161 """ | |
162 if len(cigar_ops) == 1 and cigar_ops[0][1] in "M=X": | |
163 # Easy case, note start/end/pos all one-based | |
164 assert cigar_ops[0][0] == len(seq) | |
165 return seq[start:end] | |
166 # Would use "start <= i < end" if they were all integers, but | |
167 # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8. | |
168 return "".join(base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1) | |
169 | |
170 assert "GGGG" == get_roi("AAAAGGGGTTTT", decode_cigar("12M"), 4, 8) | |
171 assert "GGGG" == get_roi("AAAAcGGGGTTTT", decode_cigar("4M1I8M"), 4, 8) | |
172 assert "GGGG" == get_roi("AAAAGGGGcTTTT", decode_cigar("8M1I4M"), 4, 8) | |
173 assert "GGGG" == get_roi("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"), 4, 8) | |
174 assert "GGaGG" == get_roi("AAAAGGaGGTTTT", decode_cigar("6M1I6M"), 4, 8) | |
175 assert "GGGgA" == get_roi("AAAAGGGgATTTT", decode_cigar("7M1I5M"), 4, 8) | |
176 | |
177 | |
178 def count_region(): | |
179 # Could recreate the region string (with no commas in start/end)? | |
180 # region = "%s:%i-%i" % (ref, start, end) | |
181 | |
182 tally = dict() | |
183 | |
184 # Call samtools view, don't need header so no -h added. | |
185 # Only want mapped reads, thus flag filter -F 4. | |
186 child = subprocess.Popen(["samtools", "view", "-F", "4", bam_file, region], | |
187 stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
188 for line in child.stdout: | |
189 assert line[0] != "@", "Got unexpected SAM header line: %s" % line | |
190 qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, rest = line.split("\t", 10) | |
191 pos = int(pos) # one-based | |
192 if start < pos: | |
193 # Does not span the ROI | |
194 continue | |
195 cigar_ops = decode_cigar(cigar) | |
196 if pos + align_len(cigar_ops) - 1 < end: | |
197 # Does not span the ROI | |
198 continue | |
199 # All of start/end/pos are currently one-based, making offsets Python style.... | |
200 roi_seq = get_roi(seq, cigar_ops, start - pos, end - pos + 1) | |
201 assert roi_seq, "Error, empty ROI sequence for: %s" % line | |
202 try: | |
203 tally[roi_seq] += 1 | |
204 except KeyError: | |
205 tally[roi_seq] = 1 | |
206 | |
207 stderr = child.stderr.read() | |
208 child.stdout.close() | |
209 child.stderr.close() | |
210 return_code = child.wait() | |
211 if return_code: | |
212 sys.exit("Got return code %i from samtools view" % return_code) | |
213 elif "specifies an unknown reference name. Continue anyway." in stderr: | |
214 sys.exit(stderr.strip() + "\n\nERROR: samtools did not recognise the region requested, can't count any variants.") | |
215 | |
216 return tally | |
217 | |
218 | |
219 def record_counts(): | |
220 | |
221 tally = count_region() | |
222 total = sum(tally.values()) | |
223 | |
224 # Using negative count to get sort with highest count first, | |
225 # while tie-breaking by the ROI sequence alphabetically. | |
226 table = sorted((-count, roi_seq) for (roi_seq, count) in tally.items()) | |
227 del tally | |
228 | |
229 with open(tabular_filename, "w") as handle: | |
230 handle.write("Variant\tCount\tPercentage\n") | |
231 for count, roi_seq in table: | |
232 handle.write("%s\t%i\t%0.2f\n" % (roi_seq, -count, -count * 100.0 / total)) | |
233 | |
234 print("Counted %i variants from %i reads spanning %s" % (len(table), total, region)) | |
235 | |
236 | |
237 # Run it! | |
238 record_counts() | |
239 # Remove the temp symlinks and files: | |
240 clean_up() |