Mercurial > repos > peterjc > count_roi_variants
diff tools/count_roi_variants/count_roi_variants.py @ 1:3ee6f4d0ac80 draft
v0.0.5 Fix version inconsistency, using samtools v1.2 now.
author | peterjc |
---|---|
date | Tue, 16 May 2017 09:14:18 -0400 |
parents | 95efbdb72961 |
children | 167765a633c0 |
line wrap: on
line diff
--- a/tools/count_roi_variants/count_roi_variants.py Wed Feb 01 07:10:26 2017 -0500 +++ b/tools/count_roi_variants/count_roi_variants.py Tue May 16 09:14:18 2017 -0400 @@ -12,9 +12,10 @@ the observed variants spanning the ROI, and outputs this as a tabular file. """ -import sys + import os import subprocess +import sys import tempfile if "-v" in sys.argv or "--version" in sys.argv: @@ -97,7 +98,9 @@ return answer -assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14, 'S'), (15, 'M'), (1, 'P'), (1, 'D'), (3, 'P'), (54, 'M'), (1, 'D'), (34, 'M'), (5, 'S')] +assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14, 'S'), (15, 'M'), (1, 'P'), + (1, 'D'), (3, 'P'), (54, 'M'), + (1, 'D'), (34, 'M'), (5, 'S')] def align_len(cigar_ops): @@ -138,16 +141,32 @@ else: raise NotImplementedError("Unexpected CIGAR operator %s" % op) + assert list(expand_cigar("ACGT", decode_cigar("4M"))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")] assert list(expand_cigar("ACGT", decode_cigar("2=1X1="))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")] assert list(expand_cigar("ACGT", decode_cigar("2M1D2M"))) == [(0, "A"), (1, "C"), (3, "G"), (4, "T")] assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [(0, "A"), (1, "C"), (1.5, "t"), (2, "G"), (3, "T")] assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [(-0.5, 't'), (0, 'A'), (1, 'C'), (2, 'G'), (3, 'T')] assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [(0, 'A'), (1, 'C'), (2, 'G'), (3, 'T'), (3.5, 't')] -assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')] -assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')] -assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, "c"), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')] -assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), (3, 'A'), (3.5, 'c'), (4, 'G'), (5, 'G'), (6, 'G'), (7, 'G'), (7.5, 'c'), (8, 'T'), (9, 'T'), (10, 'T'), (11, 'T')] +assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), + (3, 'A'), (4, 'G'), (5, 'G'), + (6, 'G'), (7, 'G'), (8, 'T'), + (9, 'T'), (10, 'T'), (11, 'T')] +assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), + (3, 'A'), (3.5, 'c'), (4, 'G'), + (5, 'G'), (6, 'G'), (7, 'G'), + (8, 'T'), (9, 'T'), (10, 'T'), + (11, 'T')] +assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), + (3, 'A'), (4, 'G'), (5, 'G'), + (6, 'G'), (7, 'G'), (7.5, "c"), + (8, 'T'), (9, 'T'), (10, 'T'), + (11, 'T')] +assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), + (3, 'A'), (3.5, 'c'), (4, 'G'), + (5, 'G'), (6, 'G'), (7, 'G'), + (7.5, 'c'), (8, 'T'), (9, 'T'), + (10, 'T'), (11, 'T')] def get_roi(seq, cigar_ops, start, end): @@ -167,6 +186,7 @@ # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8. return "".join(base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1) + assert "GGGG" == get_roi("AAAAGGGGTTTT", decode_cigar("12M"), 4, 8) assert "GGGG" == get_roi("AAAAcGGGGTTTT", decode_cigar("4M1I8M"), 4, 8) assert "GGGG" == get_roi("AAAAGGGGcTTTT", decode_cigar("8M1I4M"), 4, 8) @@ -176,6 +196,10 @@ def count_region(): + """Count reads mapped to the region. + + Calls samtools view and parses the output. + """ # Could recreate the region string (with no commas in start/end)? # region = "%s:%i-%i" % (ref, start, end) @@ -211,13 +235,15 @@ if return_code: sys.exit("Got return code %i from samtools view" % return_code) elif "specifies an unknown reference name. Continue anyway." in stderr: - sys.exit(stderr.strip() + "\n\nERROR: samtools did not recognise the region requested, can't count any variants.") + sys.exit(stderr.strip() + + "\n\nERROR: samtools did not recognise the region requested, " + "can't count any variants.") return tally def record_counts(): - + """Record counts to tabular file.""" tally = count_region() total = sum(tally.values())