annotate tools/count_roi_variants/count_roi_variants.py @ 2:167765a633c0 draft default tip

"Update all the pico_galaxy tools on main Tool Shed"
author peterjc
date Fri, 16 Apr 2021 22:34:00 +0000
parents 3ee6f4d0ac80
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
2 """Count sequence variants in region of interest in a BAM file.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
3
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
4 This script takes exactly four command line arguments:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
5 * Input BAM filename
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
6 * Input BAI filename (via Galaxy metadata)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
7 * Output tabular filename
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
8 * Region of interest (reference:start-end as used in samtools)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
9
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
10 This messes about with the filenames to make samtools happy, then
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
11 runs "samtools view" and parses the reads mapped to the ROI, counts
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
12 the observed variants spanning the ROI, and outputs this as a
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
13 tabular file.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
14 """
1
3ee6f4d0ac80 v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents: 0
diff changeset
15
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
16 import os
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
17 import subprocess
1
3ee6f4d0ac80 v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents: 0
diff changeset
18 import sys
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
19 import tempfile
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
20
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
21 if "-v" in sys.argv or "--version" in sys.argv:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
22 # Galaxy seems to invert the order of the two lines
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
23 print("BAM coverage statistics v0.0.6 (using samtools)")
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
24 cmd = "samtools 2>&1 | grep -i ^Version"
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
25 sys.exit(os.system(cmd))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
26
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
27 # TODO - Proper command line API
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
28 usage = """Requires 4 arguments: BAM, BAI, tabular filename, samtools-style region
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
29
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
30 For ease of use, you can use a minus sign as the BAI filename which will use the BAM
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
31 filename with the suffix .bai added to it. Example using one of the test-data files:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
32
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
33 $ count_roi_variants.py "SRR639755_mito_pairs_vs_NC_010642_clc.bam" "-" "counts.txt" "gi|187250362|ref|NC_010642.1|:1695-1725"
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
34 Counted 3 variants from 16 reads spanning gi|187250362|ref|NC_010642.1|:1695-1725
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
35 $ cat counts.txt
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
36 Variant Count Percentage
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
37 AGCCCATGAGATGGGAAGCAATGGGCTACA 14 87.50
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
38 AGCCCATGAGATGGGAAGCAATGGGCTACG 1 6.25
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
39 AGCGCATGAGATGGGAAGCAATGGGCTACG 1 6.25
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
40 """ # noqa: E501
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
41
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
42 if len(sys.argv) == 5:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
43 bam_filename, bai_filename, tabular_filename, region = sys.argv[1:]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
44 else:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
45 sys.exit(usage)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
46
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
47 if not os.path.isfile(bam_filename):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
48 sys.exit("Input BAM file not found: %s" % bam_filename)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
49 if bai_filename == "-":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
50 # Make this optional for ease of use at the command line by hand:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
51 if os.path.isfile(bam_filename + ".bai"):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
52 bai_filename = bam_filename + ".bai"
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
53 if not os.path.isfile(bai_filename):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
54 if bai_filename == "None":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
55 sys.exit("Error: Galaxy did not index your BAM file")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
56 sys.exit("Input BAI file not found: %s" % bai_filename)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
57
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
58 try:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
59 # Sanity check we have "ref:start-end" to give clear error message
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
60 # Note can have semi-colon in the reference name
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
61 # Note can have thousand separator commas in the start/end
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
62 ref, start_end = region.rsplit(":", 1)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
63 start, end = start_end.split("-")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
64 start = int(start.replace(",", ""))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
65 end = int(end.replace(",", ""))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
66 except ValueError:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
67 sys.exit("Bad argument for region: %r" % region)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
68
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
69
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
70 # Assign sensible names with real extensions, and setup symlinks:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
71 tmp_dir = tempfile.mkdtemp()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
72 bam_file = os.path.join(tmp_dir, "temp.bam")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
73 bai_file = os.path.join(tmp_dir, "temp.bam.bai")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
74 os.symlink(os.path.abspath(bam_filename), bam_file)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
75 os.symlink(os.path.abspath(bai_filename), bai_file)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
76 assert os.path.isfile(bam_file), bam_file
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
77 assert os.path.isfile(bai_file), bai_file
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
78 assert os.path.isfile(bam_file + ".bai"), bam_file
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
79
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
80
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
81 def clean_up():
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
82 os.remove(bam_file)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
83 os.remove(bai_file)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
84 os.rmdir(tmp_dir)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
85
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
86
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
87 def decode_cigar(cigar):
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
88 """Return a list of 2-tuples, integer count and operator char."""
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
89 count = ""
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
90 answer = []
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
91 for letter in cigar:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
92 if letter.isdigit():
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
93 count += letter # string addition
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
94 elif letter in "MIDNSHP=X":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
95 answer.append((int(count), letter))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
96 count = ""
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
97 else:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
98 raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
99 return answer
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
100
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
101
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
102 assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
103 (14, "S"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
104 (15, "M"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
105 (1, "P"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
106 (1, "D"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
107 (3, "P"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
108 (54, "M"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
109 (1, "D"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
110 (34, "M"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
111 (5, "S"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
112 ]
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
113
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
114
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
115 def align_len(cigar_ops):
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
116 """Sum the CIGAR M/=/X/D/N operators."""
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
117 return sum(count for count, op in cigar_ops if op in "M=XDN")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
118
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
119
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
120 def expand_cigar(seq, cigar_ops):
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
121 """Yield (ref_offset, seq_base) pairs."""
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
122 ref_offset = 0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
123 seq_offset = 0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
124 for count, op in cigar_ops:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
125 if op in "MX=":
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
126 for (i, base) in enumerate(seq[seq_offset : seq_offset + count]):
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
127 yield ref_offset + i, base
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
128 ref_offset += count
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
129 seq_offset += count
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
130 elif op == "I":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
131 # Give them all an in-between reference position
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
132 # (Python lets us mix integers and floats, wouldn't work in C)
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
133 for (i, base) in enumerate(seq[seq_offset : seq_offset + count]):
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
134 yield ref_offset - 0.5, base
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
135 # Does not change ref_offset
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
136 seq_offset += count
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
137 elif op in "DN":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
138 # Deletion/skip,
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
139 # TODO: Option to return gap characters
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
140 # for i in range(count):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
141 # yield ref_offset + i, "-"
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
142 ref_offset += count
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
143 elif op == "S":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
144 # Soft clipping, silently discard the bases (OK?)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
145 seq_offset += count
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
146 elif op in "HP":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
147 # Hard trimming or pad, can ignore
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
148 # TODO: Yield "-" if later offer to report deletions
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
149 pass
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
150 else:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
151 raise NotImplementedError("Unexpected CIGAR operator %s" % op)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
152
1
3ee6f4d0ac80 v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents: 0
diff changeset
153
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
154 assert list(expand_cigar("ACGT", decode_cigar("4M"))) == [
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
155 (0, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
156 (1, "C"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
157 (2, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
158 (3, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
159 ]
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
160 assert list(expand_cigar("ACGT", decode_cigar("2=1X1="))) == [
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
161 (0, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
162 (1, "C"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
163 (2, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
164 (3, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
165 ]
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
166 assert list(expand_cigar("ACGT", decode_cigar("2M1D2M"))) == [
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
167 (0, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
168 (1, "C"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
169 (3, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
170 (4, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
171 ]
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
172 assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
173 (0, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
174 (1, "C"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
175 (1.5, "t"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
176 (2, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
177 (3, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
178 ]
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
179 assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
180 (-0.5, "t"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
181 (0, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
182 (1, "C"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
183 (2, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
184 (3, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
185 ]
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
186 assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
187 (0, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
188 (1, "C"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
189 (2, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
190 (3, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
191 (3.5, "t"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
192 ]
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
193 assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
194 (0, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
195 (1, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
196 (2, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
197 (3, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
198 (4, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
199 (5, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
200 (6, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
201 (7, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
202 (8, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
203 (9, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
204 (10, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
205 (11, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
206 ]
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
207 assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
208 (0, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
209 (1, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
210 (2, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
211 (3, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
212 (3.5, "c"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
213 (4, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
214 (5, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
215 (6, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
216 (7, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
217 (8, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
218 (9, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
219 (10, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
220 (11, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
221 ]
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
222 assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
223 (0, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
224 (1, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
225 (2, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
226 (3, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
227 (4, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
228 (5, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
229 (6, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
230 (7, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
231 (7.5, "c"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
232 (8, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
233 (9, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
234 (10, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
235 (11, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
236 ]
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
237 assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
238 (0, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
239 (1, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
240 (2, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
241 (3, "A"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
242 (3.5, "c"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
243 (4, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
244 (5, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
245 (6, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
246 (7, "G"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
247 (7.5, "c"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
248 (8, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
249 (9, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
250 (10, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
251 (11, "T"),
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
252 ]
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
253
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
254
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
255 def get_roi(seq, cigar_ops, start, end):
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
256 """Extract region of seq mapping to the ROI.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
257
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
258 Expect start and end to be zero based Python style end points.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
259
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
260 i.e. The ROI relative to the mapping start recorded in the POS field.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
261 Will return part of the SAM/BAM value SEQ based on interpretting the
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
262 passed CIGAR operators.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
263 """
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
264 if len(cigar_ops) == 1 and cigar_ops[0][1] in "M=X":
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
265 # Easy case, note start/end/pos all one-based
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
266 assert cigar_ops[0][0] == len(seq)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
267 return seq[start:end]
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
268 # Would use "start <= i < end" if they were all integers, but
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
269 # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8.
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
270 return "".join(
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
271 base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
272 )
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
273
1
3ee6f4d0ac80 v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents: 0
diff changeset
274
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
275 assert "GGGG" == get_roi("AAAAGGGGTTTT", decode_cigar("12M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
276 assert "GGGG" == get_roi("AAAAcGGGGTTTT", decode_cigar("4M1I8M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
277 assert "GGGG" == get_roi("AAAAGGGGcTTTT", decode_cigar("8M1I4M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
278 assert "GGGG" == get_roi("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
279 assert "GGaGG" == get_roi("AAAAGGaGGTTTT", decode_cigar("6M1I6M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
280 assert "GGGgA" == get_roi("AAAAGGGgATTTT", decode_cigar("7M1I5M"), 4, 8)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
281
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
282
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
283 def count_region():
1
3ee6f4d0ac80 v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents: 0
diff changeset
284 """Count reads mapped to the region.
3ee6f4d0ac80 v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents: 0
diff changeset
285
3ee6f4d0ac80 v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents: 0
diff changeset
286 Calls samtools view and parses the output.
3ee6f4d0ac80 v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents: 0
diff changeset
287 """
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
288 # Could recreate the region string (with no commas in start/end)?
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
289 # region = "%s:%i-%i" % (ref, start, end)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
290
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
291 tally = {}
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
292
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
293 # Call samtools view, don't need header so no -h added.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
294 # Only want mapped reads, thus flag filter -F 4.
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
295 child = subprocess.Popen(
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
296 ["samtools", "view", "-F", "4", bam_file, region],
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
297 universal_newlines=True,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
298 stdout=subprocess.PIPE,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
299 stderr=subprocess.PIPE,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
300 )
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
301 for line in child.stdout:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
302 assert line[0] != "@", "Got unexpected SAM header line: %s" % line
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
303 (
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
304 qname,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
305 flag,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
306 rname,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
307 pos,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
308 mapq,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
309 cigar,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
310 rnext,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
311 pnext,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
312 tlen,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
313 seq,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
314 rest,
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
315 ) = line.split("\t", 10)
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
316 pos = int(pos) # one-based
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
317 if start < pos:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
318 # Does not span the ROI
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
319 continue
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
320 cigar_ops = decode_cigar(cigar)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
321 if pos + align_len(cigar_ops) - 1 < end:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
322 # Does not span the ROI
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
323 continue
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
324 # All of start/end/pos are currently one-based, making offsets Python style....
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
325 roi_seq = get_roi(seq, cigar_ops, start - pos, end - pos + 1)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
326 assert roi_seq, "Error, empty ROI sequence for: %s" % line
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
327 try:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
328 tally[roi_seq] += 1
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
329 except KeyError:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
330 tally[roi_seq] = 1
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
331
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
332 stderr = child.stderr.read()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
333 child.stdout.close()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
334 child.stderr.close()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
335 return_code = child.wait()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
336 if return_code:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
337 sys.exit("Got return code %i from samtools view" % return_code)
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
338 elif "specifies an unknown reference name. Continue anyway." in stderr:
2
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
339 sys.exit(
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
340 stderr.strip()
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
341 + "\n\nERROR: samtools did not recognise the region requested, "
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
342 "can't count any variants."
167765a633c0 "Update all the pico_galaxy tools on main Tool Shed"
peterjc
parents: 1
diff changeset
343 )
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
344
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
345 return tally
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
346
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
347
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
348 def record_counts():
1
3ee6f4d0ac80 v0.0.5 Fix version inconsistency, using samtools v1.2 now.
peterjc
parents: 0
diff changeset
349 """Record counts to tabular file."""
0
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
350 tally = count_region()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
351 total = sum(tally.values())
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
352
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
353 # Using negative count to get sort with highest count first,
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
354 # while tie-breaking by the ROI sequence alphabetically.
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
355 table = sorted((-count, roi_seq) for (roi_seq, count) in tally.items())
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
356 del tally
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
357
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
358 with open(tabular_filename, "w") as handle:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
359 handle.write("Variant\tCount\tPercentage\n")
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
360 for count, roi_seq in table:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
361 handle.write("%s\t%i\t%0.2f\n" % (roi_seq, -count, -count * 100.0 / total))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
362
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
363 print("Counted %i variants from %i reads spanning %s" % (len(table), total, region))
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
364
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
365
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
366 # Run it!
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
367 record_counts()
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
368 # Remove the temp symlinks and files:
95efbdb72961 v0.0.4 - Previously only on Test Tool Shed
peterjc
parents:
diff changeset
369 clean_up()