changeset 2:167765a633c0 draft default tip

"Update all the pico_galaxy tools on main Tool Shed"
author peterjc
date Fri, 16 Apr 2021 22:34:00 +0000
parents 3ee6f4d0ac80
children
files tools/count_roi_variants/README.rst tools/count_roi_variants/count_roi_variants.py tools/count_roi_variants/count_roi_variants.xml tools/count_roi_variants/tool_dependencies.xml
diffstat 4 files changed, 152 insertions(+), 48 deletions(-) [+]
line wrap: on
line diff
--- a/tools/count_roi_variants/README.rst	Tue May 16 09:14:18 2017 -0400
+++ b/tools/count_roi_variants/README.rst	Fri Apr 16 22:34:00 2021 +0000
@@ -83,6 +83,7 @@
 v0.0.5  - Fix samtools dependency version inconsistency, using v1.2 now.
         - Use ``<command detect_errors="aggressive">`` (internal change only).
         - Single quote command line arguments (internal change only).
+v0.0.6  - Python 3 compatibility fix.
 ======= ======================================================================
 
 
--- a/tools/count_roi_variants/count_roi_variants.py	Tue May 16 09:14:18 2017 -0400
+++ b/tools/count_roi_variants/count_roi_variants.py	Fri Apr 16 22:34:00 2021 +0000
@@ -20,7 +20,7 @@
 
 if "-v" in sys.argv or "--version" in sys.argv:
     # Galaxy seems to invert the order of the two lines
-    print("BAM coverage statistics v0.0.4 (using samtools)")
+    print("BAM coverage statistics v0.0.6 (using samtools)")
     cmd = "samtools 2>&1 | grep -i ^Version"
     sys.exit(os.system(cmd))
 
@@ -37,7 +37,8 @@
 AGCCCATGAGATGGGAAGCAATGGGCTACA	14	87.50
 AGCCCATGAGATGGGAAGCAATGGGCTACG	1	6.25
 AGCGCATGAGATGGGAAGCAATGGGCTACG	1	6.25
-"""
+"""  # noqa: E501
+
 if len(sys.argv) == 5:
     bam_filename, bai_filename, tabular_filename, region = sys.argv[1:]
 else:
@@ -84,7 +85,7 @@
 
 
 def decode_cigar(cigar):
-    """Returns a list of 2-tuples, integer count and operator char."""
+    """Return a list of 2-tuples, integer count and operator char."""
     count = ""
     answer = []
     for letter in cigar:
@@ -98,30 +99,38 @@
     return answer
 
 
-assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14, 'S'), (15, 'M'), (1, 'P'),
-                                                  (1, 'D'), (3, 'P'), (54, 'M'),
-                                                  (1, 'D'), (34, 'M'), (5, 'S')]
+assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [
+    (14, "S"),
+    (15, "M"),
+    (1, "P"),
+    (1, "D"),
+    (3, "P"),
+    (54, "M"),
+    (1, "D"),
+    (34, "M"),
+    (5, "S"),
+]
 
 
 def align_len(cigar_ops):
-    """Sums the CIGAR M/=/X/D/N operators."""
+    """Sum the CIGAR M/=/X/D/N operators."""
     return sum(count for count, op in cigar_ops if op in "M=XDN")
 
 
 def expand_cigar(seq, cigar_ops):
-    """Yields (ref_offset, seq_base) pairs."""
+    """Yield (ref_offset, seq_base) pairs."""
     ref_offset = 0
     seq_offset = 0
     for count, op in cigar_ops:
         if op in "MX=":
-            for (i, base) in enumerate(seq[seq_offset:seq_offset + count]):
+            for (i, base) in enumerate(seq[seq_offset : seq_offset + count]):
                 yield ref_offset + i, base
             ref_offset += count
             seq_offset += count
         elif op == "I":
             # Give them all an in-between reference position
             # (Python lets us mix integers and floats, wouldn't work in C)
-            for (i, base) in enumerate(seq[seq_offset:seq_offset + count]):
+            for (i, base) in enumerate(seq[seq_offset : seq_offset + count]):
                 yield ref_offset - 0.5, base
             # Does not change ref_offset
             seq_offset += count
@@ -142,31 +151,105 @@
             raise NotImplementedError("Unexpected CIGAR operator %s" % op)
 
 
-assert list(expand_cigar("ACGT", decode_cigar("4M"))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")]
-assert list(expand_cigar("ACGT", decode_cigar("2=1X1="))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")]
-assert list(expand_cigar("ACGT", decode_cigar("2M1D2M"))) == [(0, "A"), (1, "C"), (3, "G"), (4, "T")]
-assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [(0, "A"), (1, "C"), (1.5, "t"), (2, "G"), (3, "T")]
-assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [(-0.5, 't'), (0, 'A'), (1, 'C'), (2, 'G'), (3, 'T')]
-assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [(0, 'A'), (1, 'C'), (2, 'G'), (3, 'T'), (3.5, 't')]
-assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [(0, 'A'), (1, 'A'), (2, 'A'),
-                                                                   (3, 'A'), (4, 'G'), (5, 'G'),
-                                                                   (6, 'G'), (7, 'G'), (8, 'T'),
-                                                                   (9, 'T'), (10, 'T'), (11, 'T')]
-assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [(0, 'A'), (1, 'A'), (2, 'A'),
-                                                                       (3, 'A'), (3.5, 'c'), (4, 'G'),
-                                                                       (5, 'G'), (6, 'G'), (7, 'G'),
-                                                                       (8, 'T'), (9, 'T'), (10, 'T'),
-                                                                       (11, 'T')]
-assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'),
-                                                                       (3, 'A'), (4, 'G'), (5, 'G'),
-                                                                       (6, 'G'), (7, 'G'), (7.5, "c"),
-                                                                       (8, 'T'), (9, 'T'), (10, 'T'),
-                                                                       (11, 'T')]
-assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'),
-                                                                            (3, 'A'), (3.5, 'c'), (4, 'G'),
-                                                                            (5, 'G'), (6, 'G'), (7, 'G'),
-                                                                            (7.5, 'c'), (8, 'T'), (9, 'T'),
-                                                                            (10, 'T'), (11, 'T')]
+assert list(expand_cigar("ACGT", decode_cigar("4M"))) == [
+    (0, "A"),
+    (1, "C"),
+    (2, "G"),
+    (3, "T"),
+]
+assert list(expand_cigar("ACGT", decode_cigar("2=1X1="))) == [
+    (0, "A"),
+    (1, "C"),
+    (2, "G"),
+    (3, "T"),
+]
+assert list(expand_cigar("ACGT", decode_cigar("2M1D2M"))) == [
+    (0, "A"),
+    (1, "C"),
+    (3, "G"),
+    (4, "T"),
+]
+assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [
+    (0, "A"),
+    (1, "C"),
+    (1.5, "t"),
+    (2, "G"),
+    (3, "T"),
+]
+assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [
+    (-0.5, "t"),
+    (0, "A"),
+    (1, "C"),
+    (2, "G"),
+    (3, "T"),
+]
+assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [
+    (0, "A"),
+    (1, "C"),
+    (2, "G"),
+    (3, "T"),
+    (3.5, "t"),
+]
+assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [
+    (0, "A"),
+    (1, "A"),
+    (2, "A"),
+    (3, "A"),
+    (4, "G"),
+    (5, "G"),
+    (6, "G"),
+    (7, "G"),
+    (8, "T"),
+    (9, "T"),
+    (10, "T"),
+    (11, "T"),
+]
+assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [
+    (0, "A"),
+    (1, "A"),
+    (2, "A"),
+    (3, "A"),
+    (3.5, "c"),
+    (4, "G"),
+    (5, "G"),
+    (6, "G"),
+    (7, "G"),
+    (8, "T"),
+    (9, "T"),
+    (10, "T"),
+    (11, "T"),
+]
+assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [
+    (0, "A"),
+    (1, "A"),
+    (2, "A"),
+    (3, "A"),
+    (4, "G"),
+    (5, "G"),
+    (6, "G"),
+    (7, "G"),
+    (7.5, "c"),
+    (8, "T"),
+    (9, "T"),
+    (10, "T"),
+    (11, "T"),
+]
+assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [
+    (0, "A"),
+    (1, "A"),
+    (2, "A"),
+    (3, "A"),
+    (3.5, "c"),
+    (4, "G"),
+    (5, "G"),
+    (6, "G"),
+    (7, "G"),
+    (7.5, "c"),
+    (8, "T"),
+    (9, "T"),
+    (10, "T"),
+    (11, "T"),
+]
 
 
 def get_roi(seq, cigar_ops, start, end):
@@ -184,7 +267,9 @@
         return seq[start:end]
     # Would use "start <= i < end" if they were all integers, but
     # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8.
-    return "".join(base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1)
+    return "".join(
+        base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1
+    )
 
 
 assert "GGGG" == get_roi("AAAAGGGGTTTT", decode_cigar("12M"), 4, 8)
@@ -203,15 +288,31 @@
     # Could recreate the region string (with no commas in start/end)?
     # region = "%s:%i-%i" % (ref, start, end)
 
-    tally = dict()
+    tally = {}
 
     # Call samtools view, don't need header so no -h added.
     # Only want mapped reads, thus flag filter -F 4.
-    child = subprocess.Popen(["samtools", "view", "-F", "4", bam_file, region],
-                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    child = subprocess.Popen(
+        ["samtools", "view", "-F", "4", bam_file, region],
+        universal_newlines=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
     for line in child.stdout:
         assert line[0] != "@", "Got unexpected SAM header line: %s" % line
-        qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, rest = line.split("\t", 10)
+        (
+            qname,
+            flag,
+            rname,
+            pos,
+            mapq,
+            cigar,
+            rnext,
+            pnext,
+            tlen,
+            seq,
+            rest,
+        ) = line.split("\t", 10)
         pos = int(pos)  # one-based
         if start < pos:
             # Does not span the ROI
@@ -235,9 +336,11 @@
     if return_code:
         sys.exit("Got return code %i from samtools view" % return_code)
     elif "specifies an unknown reference name. Continue anyway." in stderr:
-        sys.exit(stderr.strip() +
-                 "\n\nERROR: samtools did not recognise the region requested, "
-                 "can't count any variants.")
+        sys.exit(
+            stderr.strip()
+            + "\n\nERROR: samtools did not recognise the region requested, "
+            "can't count any variants."
+        )
 
     return tally
 
--- a/tools/count_roi_variants/count_roi_variants.xml	Tue May 16 09:14:18 2017 -0400
+++ b/tools/count_roi_variants/count_roi_variants.xml	Fri Apr 16 22:34:00 2021 +0000
@@ -1,4 +1,4 @@
-<tool id="count_roi_variants" name="Count sequence variants in region of interest" version="0.0.5">
+<tool id="count_roi_variants" name="Count sequence variants in region of interest" version="0.0.6">
     <description>using samtools view</description>
     <requirements>
         <requirement type="package" version="1.2">samtools</requirement>
@@ -96,7 +96,7 @@
 
 Heng Li et al (2009). The Sequence Alignment/Map format and SAMtools.
 Bioinformatics 25(16), 2078-9.
-http://dx.doi.org/10.1093/bioinformatics/btp352
+https://doi.org/10.1093/bioinformatics/btp352
 
 Peter J.A. Cock (2016), Count sequence variants in region of interest in BAM file.
 http://toolshed.g2.bx.psu.edu/view/peterjc/count_roi_variants
--- a/tools/count_roi_variants/tool_dependencies.xml	Tue May 16 09:14:18 2017 -0400
+++ b/tools/count_roi_variants/tool_dependencies.xml	Fri Apr 16 22:34:00 2021 +0000
@@ -1,6 +1,6 @@
-<?xml version="1.0"?>
+<?xml version="1.0" ?>
 <tool_dependency>
     <package name="samtools" version="1.2">
-        <repository changeset_revision="f6ae3ba3f3c1" name="package_samtools_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+        <repository name="package_samtools_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="f6ae3ba3f3c1"/>
     </package>
-</tool_dependency>
+</tool_dependency>
\ No newline at end of file