annotate tools/count_roi_variants/count_roi_variants.py @ 0:95efbdb72961 draft

v0.0.4 - Previously only on Test Tool Shed
author peterjc
date Wed, 01 Feb 2017 07:10:26 -0500
parents
children 3ee6f4d0ac80
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
2 """Count sequence variants in region of interest in a BAM file.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
3
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
4 This script takes exactly four command line arguments:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
5 * Input BAM filename
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
6 * Input BAI filename (via Galaxy metadata)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
7 * Output tabular filename
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
8 * Region of interest (reference:start-end as used in samtools)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
9
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
10 This messes about with the filenames to make samtools happy, then
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
11 runs "samtools view" and parses the reads mapped to the ROI, counts
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
12 the observed variants spanning the ROI, and outputs this as a
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
13 tabular file.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
14 """
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
15 import sys
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
16 import os
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
17 import subprocess
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
18 import tempfile
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
19
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
20 if "-v" in sys.argv or "--version" in sys.argv:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
21 # Galaxy seems to invert the order of the two lines
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
22 print("BAM coverage statistics v0.0.4 (using samtools)")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
23 cmd = "samtools 2>&1 | grep -i ^Version"
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
24 sys.exit(os.system(cmd))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
25
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
26 # TODO - Proper command line API
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
27 usage = """Requires 4 arguments: BAM, BAI, tabular filename, samtools-style region
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
28
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
29 For ease of use, you can use a minus sign as the BAI filename which will use the BAM
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
30 filename with the suffix .bai added to it. Example using one of the test-data files:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
31
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
32 $ count_roi_variants.py "SRR639755_mito_pairs_vs_NC_010642_clc.bam" "-" "counts.txt" "gi|187250362|ref|NC_010642.1|:1695-1725"
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
33 Counted 3 variants from 16 reads spanning gi|187250362|ref|NC_010642.1|:1695-1725
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
34 $ cat counts.txt
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
35 Variant Count Percentage
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
36 AGCCCATGAGATGGGAAGCAATGGGCTACA 14 87.50
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
37 AGCCCATGAGATGGGAAGCAATGGGCTACG 1 6.25
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
38 AGCGCATGAGATGGGAAGCAATGGGCTACG 1 6.25
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
39 """
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
40 if len(sys.argv) == 5:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
41 bam_filename, bai_filename, tabular_filename, region = sys.argv[1:]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
42 else:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
43 sys.exit(usage)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
44
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
45 if not os.path.isfile(bam_filename):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
46 sys.exit("Input BAM file not found: %s" % bam_filename)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
47 if bai_filename == "-":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
48 # Make this optional for ease of use at the command line by hand:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
49 if os.path.isfile(bam_filename + ".bai"):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
50 bai_filename = bam_filename + ".bai"
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
51 if not os.path.isfile(bai_filename):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
52 if bai_filename == "None":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
53 sys.exit("Error: Galaxy did not index your BAM file")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
54 sys.exit("Input BAI file not found: %s" % bai_filename)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
55
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
56 try:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
57 # Sanity check we have "ref:start-end" to give clear error message
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
58 # Note can have semi-colon in the reference name
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
59 # Note can have thousand separator commas in the start/end
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
60 ref, start_end = region.rsplit(":", 1)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
61 start, end = start_end.split("-")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
62 start = int(start.replace(",", ""))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
63 end = int(end.replace(",", ""))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
64 except ValueError:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
65 sys.exit("Bad argument for region: %r" % region)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
66
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
67
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
68 # Assign sensible names with real extensions, and setup symlinks:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
69 tmp_dir = tempfile.mkdtemp()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
70 bam_file = os.path.join(tmp_dir, "temp.bam")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
71 bai_file = os.path.join(tmp_dir, "temp.bam.bai")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
72 os.symlink(os.path.abspath(bam_filename), bam_file)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
73 os.symlink(os.path.abspath(bai_filename), bai_file)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
74 assert os.path.isfile(bam_file), bam_file
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
75 assert os.path.isfile(bai_file), bai_file
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
76 assert os.path.isfile(bam_file + ".bai"), bam_file
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
77
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
78
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
79 def clean_up():
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
80 os.remove(bam_file)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
81 os.remove(bai_file)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
82 os.rmdir(tmp_dir)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
83
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
84
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
85 def decode_cigar(cigar):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
86 """Returns a list of 2-tuples, integer count and operator char."""
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
87 count = ""
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
88 answer = []
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
89 for letter in cigar:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
90 if letter.isdigit():
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
91 count += letter # string addition
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
92 elif letter in "MIDNSHP=X":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
93 answer.append((int(count), letter))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
94 count = ""
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
95 else:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
96 raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
97 return answer
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
98
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
99
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
100 assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14, 'S'), (15, 'M'), (1, 'P'), (1, 'D'), (3, 'P'), (54, 'M'), (1, 'D'), (34, 'M'), (5, 'S')]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
101
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
102
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
103 def align_len(cigar_ops):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
104 """Sums the CIGAR M/=/X/D/N operators."""
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
105 return sum(count for count, op in cigar_ops if op in "M=XDN")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
106
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
107
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
108 def expand_cigar(seq, cigar_ops):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
109 """Yields (ref_offset, seq_base) pairs."""
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
110 ref_offset = 0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
111 seq_offset = 0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
112 for count, op in cigar_ops:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
113 if op in "MX=":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
114 for (i, base) in enumerate(seq[seq_offset:seq_offset + count]):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
115 yield ref_offset + i, base
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
116 ref_offset += count
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
117 seq_offset += count
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
118 elif op == "I":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
119 # Give them all an in-between reference position
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
120 # (Python lets us mix integers and floats, wouldn't work in C)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
121 for (i, base) in enumerate(seq[seq_offset:seq_offset + count]):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
122 yield ref_offset - 0.5, base
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
123 # Does not change ref_offset
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
124 seq_offset += count
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
125 elif op in "DN":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
126 # Deletion/skip,
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
127 # TODO: Option to return gap characters
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
128 # for i in range(count):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
129 # yield ref_offset + i, "-"
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
130 ref_offset += count
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
131 elif op == "S":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
132 # Soft clipping, silently discard the bases (OK?)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
133 seq_offset += count
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
134 elif op in "HP":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
135 # Hard trimming or pad, can ignore
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
136 # TODO: Yield "-" if later offer to report deletions
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
137 pass
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
138 else:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
139 raise NotImplementedError("Unexpected CIGAR operator %s" % op)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
140
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
141 assert list(expand_cigar("ACGT", decode_cigar("4M"))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
142 assert list(expand_cigar("ACGT", decode_cigar("2=1X1="))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
143 assert list(expand_cigar("ACGT", decode_cigar("2M1D2M"))) == [(0, "A"), (1, "C"), (3, "G"), (4, "T")]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
144 assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [(0, "A"), (1, "C"), (1.5, "t"), (2, "G"), (3, "T")]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
145 assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [(-0.5, 't'), (0, 'A'), (1, 'C'), (2, 'G'), (3, 'T')]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
146 assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [(0, 'A'), (1, 'C'), (2, 'G'), (3, 'T'), (3.5, 't')]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
147 assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
148 assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
149 assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, "c"), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
150 assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, 'c'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
151
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
152
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
153 def get_roi(seq, cigar_ops, start, end):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
154 """Extract region of seq mapping to the ROI.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
155
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
156 Expect start and end to be zero based Python style end points.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
157
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
158 i.e. The ROI relative to the mapping start recorded in the POS field.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
159 Will return part of the SAM/BAM value SEQ based on interpretting the
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
160 passed CIGAR operators.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
161 """
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
162 if len(cigar_ops) == 1 and cigar_ops[0][1] in "M=X":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
163 # Easy case, note start/end/pos all one-based
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
164 assert cigar_ops[0][0] == len(seq)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
165 return seq[start:end]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
166 # Would use "start <= i < end" if they were all integers, but
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
167 # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
168 return "".join(base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
169
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
170 assert "GGGG" == get_roi("AAAAGGGGTTTT", decode_cigar("12M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
171 assert "GGGG" == get_roi("AAAAcGGGGTTTT", decode_cigar("4M1I8M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
172 assert "GGGG" == get_roi("AAAAGGGGcTTTT", decode_cigar("8M1I4M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
173 assert "GGGG" == get_roi("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
174 assert "GGaGG" == get_roi("AAAAGGaGGTTTT", decode_cigar("6M1I6M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
175 assert "GGGgA" == get_roi("AAAAGGGgATTTT", decode_cigar("7M1I5M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
176
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
177
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
178 def count_region():
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
179 # Could recreate the region string (with no commas in start/end)?
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
180 # region = "%s:%i-%i" % (ref, start, end)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
181
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
182 tally = dict()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
183
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
184 # Call samtools view, don't need header so no -h added.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
185 # Only want mapped reads, thus flag filter -F 4.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
186 child = subprocess.Popen(["samtools", "view", "-F", "4", bam_file, region],
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
187 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
188 for line in child.stdout:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
189 assert line[0] != "@", "Got unexpected SAM header line: %s" % line
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
190 qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, rest = line.split("\t", 10)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
191 pos = int(pos) # one-based
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
192 if start < pos:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
193 # Does not span the ROI
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
194 continue
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
195 cigar_ops = decode_cigar(cigar)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
196 if pos + align_len(cigar_ops) - 1 < end:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
197 # Does not span the ROI
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
198 continue
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
199 # All of start/end/pos are currently one-based, making offsets Python style....
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
200 roi_seq = get_roi(seq, cigar_ops, start - pos, end - pos + 1)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
201 assert roi_seq, "Error, empty ROI sequence for: %s" % line
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
202 try:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
203 tally[roi_seq] += 1
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
204 except KeyError:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
205 tally[roi_seq] = 1
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
206
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
207 stderr = child.stderr.read()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
208 child.stdout.close()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
209 child.stderr.close()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
210 return_code = child.wait()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
211 if return_code:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
212 sys.exit("Got return code %i from samtools view" % return_code)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
213 elif "specifies an unknown reference name. Continue anyway." in stderr:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
214 sys.exit(stderr.strip() + "\n\nERROR: samtools did not recognise the region requested, can't count any variants.")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
215
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
216 return tally
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
217
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
218
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
219 def record_counts():
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
220
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
221 tally = count_region()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
222 total = sum(tally.values())
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
223
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
224 # Using negative count to get sort with highest count first,
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
225 # while tie-breaking by the ROI sequence alphabetically.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
226 table = sorted((-count, roi_seq) for (roi_seq, count) in tally.items())
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
227 del tally
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
228
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
229 with open(tabular_filename, "w") as handle:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
230 handle.write("Variant\tCount\tPercentage\n")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
231 for count, roi_seq in table:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
232 handle.write("%s\t%i\t%0.2f\n" % (roi_seq, -count, -count * 100.0 / total))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
233
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
234 print("Counted %i variants from %i reads spanning %s" % (len(table), total, region))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
235
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
236
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
237 # Run it!
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
238 record_counts()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
239 # Remove the temp symlinks and files:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
240 clean_up()