Repository 'ngsutils_bam_filter'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/ngsutils_bam_filter

Changeset 0:4e4e4093d65d (2015-11-11)
Next changeset 1:8187a729d9f4 (2015-12-06)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
added:
bam_filter.xml
filter.py
macros.xml
ngsutils/__init__.py
ngsutils/__init__.pyc
ngsutils/bam/__init__.py
ngsutils/bam/__init__.pyc
ngsutils/bed/__init__.py
ngsutils/bed/__init__.pyc
ngsutils/support/__init__.py
ngsutils/support/__init__.pyc
ngsutils/support/bgzip.py
ngsutils/support/dbsnp.py
ngsutils/support/dbsnp.pyc
ngsutils/support/llh.py
ngsutils/support/ngs_utils.py
ngsutils/support/ngs_utils.pyc
ngsutils/support/regions.py
ngsutils/support/stats.py
test-data/ngsutils_bam_filter_input1.bam
test-data/ngsutils_bam_filter_result1.bam
test-data/ngsutils_bam_filter_result2.bam
test-data/ngsutils_bam_filter_result3.bam
test-data/ngsutils_bam_filter_result4.bam
tool_dependencies.xml
diff -r 000000000000 -r 4e4e4093d65d bam_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bam_filter.xml Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,231 @@
+<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@.0">
+    <description>Removes reads from a BAM file based on criteria</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="stdio" />
+    <expand macro="version" />
+    <command><![CDATA[
+    ## If the tool is executed with no filtering option,
+    ## the default parameters simply copy over the input file
+    if grep -q "\w" ${parameters};
+    then
+        $__tool_directory__/filter.py
+        $infile
+        $outfile
+        `cat ${parameters}`;
+    else
+        cp $infile $outfile;
+    fi
+]]>
+    </command>
+    <configfiles>
+        <configfile name="parameters">
+<![CDATA[
+        #if $minlen:
+            -minlen $minlen
+        #end if
+        #if $maxlen
+            -maxlen $maxlen
+        #end if
+        $mapped
+        $unmapped
+        $properpair
+        $noproperpair
+        #if $mask:
+            -mask "${mask}"
+        #end if
+        #if int($uniq) > -1:
+            -uniq
+            #if int($uniq) > 0:
+                $uniq
+            #end if
+        #end if
+        $uniq_start
+        #if $mismatch:
+            -mismatch $mismatch
+        #end if
+        $nosecondary
+        $noqcfail
+        $nopcrdup
+        #if $excludebed:
+            -excludebed "${excludebed}" $ignore_strand
+        #end if
+        #if $includebed:
+            -includebed "${includebed}" $ignore_strand
+        #end if
+        #if $includeref:
+            -includeref "${includeref}"
+        #end if
+        #if $excluderef:
+            -excluderef "${excluderef}"
+        #end if
+        #if $maximum_mismatch_ratio
+            -maximum_mismatch_ratio $maximum_mismatch_ratio
+        #end if
+    ]]>
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="infile" type="data" format="bam" label="Select BAM dataset" />
+        <param argument="-minlen" type="integer" value="" optional="True" min="0"
+            label="Remove reads that are smaller than"
+            help="in bp"/>
+        <param argument="-maxlen" type="integer" value="" optional="True" min="0"
+            label="Remove reads that are larger than"
+            help="in bp"/>
+        <param argument="-mapped" truevalue="-mapped" type="boolean" falsevalue=""
+            label="Keep only mapped reads"
+            help="" />
+        <param argument="-unmapped" truevalue="-unmapped" type="boolean" falsevalue=""
+            label="Keep only unmapped reads"
+            help="" />
+        <param argument="-properpair" truevalue="-properpair" type="boolean" falsevalue=""
+            label="Keep only properly paired reads"
+            help="both mapped, correct orientation, flag set in BAM" />
+        <param argument="-noproperpair" truevalue="-noproperpair" type="boolean" falsevalue=""
+            label="Discard properly paired reads"
+            help="" />
+        <param argument="-mask" type="text" value="" optional="True" label="Remove reads that match the mask"
+            help="e.g. 0x400, 0x2" />
+
+        <param argument="-uniq" type="integer" value="-1" optional="True" min="-1"
+            label="Remove reads that have the same sequence"
+            help="up to the Nth nucleotide starting from the 5prime end. -1 means do not filter, 0 means check along the entire read,
+                10 would filter reads that are identical over the first 10 nucleotides."/>
+
+        <param argument="-uniq_start" truevalue="-uniq_start" type="boolean" falsevalue=""
+            label="Remove reads that start at the same position"
+            help="Use only for low-coverage samples" />
+
+        <param argument="-mismatch" type="integer" value="" optional="True" min="0"
+            label="Remove reads with that many mismatches"
+            help="Indels always count as 1 regardless of length. Requires NM tag to
[...]
++--------------------------------+-------------------------------------------------+
+| -maxlen val                    | Remove reads that are larger than {val}         |
++--------------------------------+-------------------------------------------------+
+| -mapped                        | Keep only mapped reads                          |
++--------------------------------+-------------------------------------------------+
+| -unmapped                      | Keep only unmapped reads                        |
++--------------------------------+-------------------------------------------------+
+| -properpair                    | Keep only properly paired reads (both mapped,   |
+|                                | correct orientation, flag set in BAM)           |
++--------------------------------+-------------------------------------------------+
+| -noproperpair                  | Keep only not-properly paired reads             |
++--------------------------------+-------------------------------------------------+
+| -mask bitmask                  | Remove reads that match the mask (e.g. 0x400)   |
++--------------------------------+-------------------------------------------------+
+| -uniq {length}                 | Remove reads that have the same sequence        |
+|                                | Note: BAM file should be sorted                 |
+|                                | (up to an optional length)                      |
++--------------------------------+-------------------------------------------------+
+| -uniq_start                    | Remove reads that start at the same position    |
+|                                | Note: BAM file should be sorted                 |
+|                                | (Use only for low-coverage samples)             |
++--------------------------------+-------------------------------------------------+
+|-mismatch num                   | Number of mismatches or indels                  |
+|                                | indel always counts as 1 regardless of length   |
+|                                | (requires NM tag in reads)                      |
++--------------------------------+-------------------------------------------------+
+|-nosecondary                    | Remove reads that have the 0x100 flag set       |
++--------------------------------+-------------------------------------------------+
+|-noqcfail                       | Remove reads that have the 0x200 flag set       |
++--------------------------------+-------------------------------------------------+
+|-nopcrdup                       | Remove reads that have the 0x400 flag set       |
++--------------------------------+-------------------------------------------------+
+|-excludebed file.bed {nostrand} | Remove reads that are in any of the regions     |
+|                                | from the given BED file. If 'nostrand' is given,|
+|                                | strand information from the BED file is ignored.|
++--------------------------------+-------------------------------------------------+
+|-includebed file.bed {nostrand} | Remove reads that are NOT any of the regions    |
+|                                | from the given BED file. If 'nostrand' is given,|
+|                                | strand information from the BED file is ignored.|
+|                                | Note: If this is a large dataset, use           |
+|                                | "bamutils extract" instead.                     |
++--------------------------------+-------------------------------------------------+
+| -includeref refname            | Exclude reads NOT mapped to a reference         |
++--------------------------------+-------------------------------------------------+
+| -excluderef refname            | Exclude reads mapped to a particular reference  |
+|                                | (e.g. chrM, or _dup chromosomes)                |
++--------------------------------+-------------------------------------------------+
+]]>
+    </help>
+</tool>
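A note on the command block above: the Cheetah configfile renders the selected options to ${parameters}, and the shell command only runs filter.py when that file contains a word character; otherwise the input BAM is copied through untouched. A minimal Python sketch of the same dispatch, assuming local paths (run_wrapper and the file names are illustrative, not part of the tool):

    import re
    import shutil
    import subprocess

    def run_wrapper(parameters_path, infile, outfile):
        # Mirror the wrapper's `grep -q "\w" ${parameters}` test: a word
        # character in the rendered configfile means a filter was selected.
        with open(parameters_path) as fh:
            params = fh.read()
        if re.search(r'\w', params):
            subprocess.check_call(['./filter.py', infile, outfile] + params.split())
        else:
            # No filtering options selected: default to a plain copy.
            shutil.copy(infile, outfile)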
diff -r 000000000000 -r 4e4e4093d65d filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,986 @@
+#!/usr/bin/env python
+## category General
+## desc Removes reads from a BAM file based on criteria
+"""
+Removes reads from a BAM file based on criteria
+
+Given a BAM file, this script will only allow reads that meet filtering
+criteria to be written to output. The output is another BAM file with the
+reads not matching the criteria removed.
+
+Note: this does not adjust tag values reflecting any filtering. (for example:
+      if a read mapped to two locations (IH:i:2), and one was removed by
+      filtering, the IH:i tag would still read IH:i:2).
+
+Currently, the available filters are:
+    -minlen val                Remove reads that are smaller than {val}
+    -maxlen val                Remove reads that are larger than {val}
+    -mapped                    Keep only mapped reads
+    -unmapped                  Keep only unmapped reads
+    -properpair                Keep only properly paired reads (both mapped,
+                               correct orientation, flag set in BAM)
+    -noproperpair              Keep only not-properly paired reads
+
+    -mask bitmask              Remove reads that match the mask (base 10/hex)
+    -uniq {length}             Remove reads that have the same sequence
+                               Note: BAM file should be sorted
+                               (up to an optional length)
+    -uniq_start                Remove reads that start at the same position
+                               Note: BAM file should be sorted
+                               (Use only for low-coverage samples)
+
+    -mismatch num              # mismatches or indels
+                               indel always counts as 1 regardless of length
+                               (requires NM tag in reads)
+
+    -mismatch_dbsnp num dbsnp.txt.bgz
+                               # mismatches or indels - not in dbSNP.
+                               Variations are called using the MD tag.
+                               Variations that are found in the dbSNP list are
+                               not counted as mismatches. The dbSNP list is a
+                               Tabix-indexed dump of dbSNP (from UCSC Genome
+                               Browser). Indels in dbSNP are also counted.
+                               Adds a 'ZS:i' tag with the number of found SNPs
+                               in the read.
+                               (requires NM and MD tags)
+
+                               Example command for indexing:
+                               ngsutils tabixindex snp.txt.gz -s 2 -b 3 -e 4 -0
+
+    -mismatch_ref num ref.fa   # mismatches or indel - looks up mismatches
+                               directly in a reference FASTA file
+                               (use if NM tag not present)
+
+    -mismatch_ref_dbsnp num ref.fa dbsnp.txt.bgz
+                               # mismatches or indels - looks up mismatches
+                               directly from a reference FASTA file. (See
+                               -mismatch_dbsnp for dbSNP matching)
+                               (use if NM or MD tag not present)
+
+    -nosecondary               Remove reads that have the 0x100 flag set
+    -noqcfail                  Remove reads that have the 0x200 flag set
+    -nopcrdup                  Remove reads that have the 0x400 flag set
+
+
+    -exclude ref:start-end     Remove reads in this region (1-based start)
+    -excludebed file.bed {nostrand}
+                               Remove reads that are in any of the regions
+                               from the given BED file. If 'nostrand' is given,
+                               strand information from the BED file is ignored.
+
+    -include ref:start-end     Remove reads NOT in the region (can only be one)
+    -includebed file.bed {nostrand}
+                               Remove reads that are NOT any of the regions
+                               from the given BED file. If 'nostrand' is given
[...]
+    if failedfile:
+        failed_out = open(failedfile, 'w')
+    else:
+        failed_out = None
+
+    passed = 0
+    failed = 0
+
+    def _callback(read):
+        return "%s | %s kept,%s failed" % ('%s:%s' % (bamfile.getrname(read.tid), read.pos) if read.tid > -1 else 'unk', passed, failed)
+
+    for read in bam_iter(bamfile, quiet=True):
+        p = True
+
+        for criterion in criteria:
+            if not criterion.filter(bamfile, read):
+                p = False
+                failed += 1
+                if failed_out:
+                    failed_out.write('%s\t%s\n' % (read.qname, criterion))
+                #outfile.write(read_to_unmapped(read))
+                break
+        if p:
+            passed += 1
+            outfile.write(read)
+
+    bamfile.close()
+    outfile.close()
+    if failed_out:
+        failed_out.close()
+    sys.stdout.write("%s kept\n%s failed\n" % (passed, failed))
+
+    for criterion in criteria:
+        criterion.close()
+
+
+def read_to_unmapped(read):
+    '''
+    Example unmapped read
+    2358_2045_1839    |  4 | *                             |  0    |  0   | *                  | *  |  0 |  0 | GGACGAGAAGGAGTATTTCTCCGAGAACACATTCACGGAGAGTCTAACTC           | 0QT\VNO^IIRJKXTIHIKTY\STZZ[XOJKPWLHJQQQ^XPQYIIRQII           | PG:Z:bfast   | AS:i:-                     | NH:i:0   | IH:i:1   | HI:i:1                                     | CS:Z:T10213222020221330022203222011113021130222212230122             | CQ:Z:019=2+><%@.'>52%)'15?90<7;:5+(-49('-7-<>5-@5%<.2%=            | CM:i:0   | XA:i:4
+
+    Example mapped read:
+    2216_1487_198     |  16 | chr11:19915291-19925392      |  5531 |   12 | 24M2D26M           | *  |  0 |  0 | TCTGTCTGGGTGCAGTAGCTATACGTAATCCCAGCACTTTGGGAGGCCAA           | 1C8FZ`M""""""WZ""%"#\I;"`R@D]``ZZ``\QKX\Y]````IK^`           | PG:Z:bfast   | AS:i:1150   | NM:i:2   | NH:i:10   | IH:i:1   | HI:i:1   | MD:Z:24^CT26                   | CS:Z:T00103022001002113210023031213331122121223321221122             | CQ:Z:A8=%;AB<;97<3/9:>>3>@?5&18@-%;866:94=14:=?,%?:7&)1            | CM:i:9   | XA:i:4   | XE:Z:-------23322---2-11----2----------------------------
+
+    '''
+    # TODO: rewrite read properties to unmapped state
+    #       if colorspace: convert CS to NT directly
+    #       remove excess tags
+
+    read.is_unmapped = True
+    read.tid = None
+    read.pos = 0
+    read.mapq = 0
+    return read
+
+if __name__ == '__main__':
+    infile = None
+    outfile = None
+    failed = None
+    criteria = []
+
+    crit_args = []
+    last = None
+    verbose = False
+    fail = False
+
+    for arg in sys.argv[1:]:
+        if last == '-failed':
+            failed = arg
+            last = None
+        elif arg == '-h':
+            usage()
+        elif arg == '-failed':
+            last = arg
+        elif arg == '-v':
+            verbose = True
+        elif not infile and os.path.exists(arg):
+            infile = arg
+        elif not outfile:
+            outfile = arg
+        elif arg[0] == '-':
+            if not arg[1:] in _criteria:
+                print "Unknown criterion: %s" % arg
+                fail = True
+            if crit_args:
+                criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+            crit_args = [arg, ]
+        elif crit_args:
+            crit_args.append(arg)
+        else:
+            print "Unknown argument: %s" % arg
+            fail = True
+
+    if not fail and crit_args:
+        criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+
+    if fail or not infile or not outfile or not criteria:
+        if not infile and not outfile and not criteria:
+            usage()
+
+        if not infile:
+            print "Missing: input bamfile"
+        if not outfile:
+            print "Missing: output bamfile"
+        if not criteria:
+            print "Missing: filtering criteria"
+        usage()
+    else:
+        bam_filter(infile, outfile, criteria, failed, verbose)
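The elided middle of filter.py defines the criterion classes registered in _criteria; the tail above shows the contract they share: each object exposes filter(bam, read), and the first criterion to reject a read ends the check. A toy sketch of that pattern (MinLen and Mapped are illustrative stand-ins, not the module's actual class names):

    class MinLen(object):
        def __init__(self, minlen):
            self.minlen = int(minlen)

        def filter(self, bam, read):
            return len(read.seq) >= self.minlen

    class Mapped(object):
        def filter(self, bam, read):
            return not read.is_unmapped

    criteria = [MinLen(50), Mapped()]

    def passes(bam, read):
        # all() short-circuits on the first False, mirroring the `break`
        # in bam_filter()'s inner loop above.
        return all(c.filter(bam, read) for c in criteria)

Per the docstring, the equivalent command line would be: filter.py input.bam output.bam -minlen 50 -mapped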
diff -r 000000000000 -r 4e4e4093d65d macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,16 @@
+<macros>
+    <token name="@WRAPPER_VERSION@">0.5.7</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="0.7.7">pysam</requirement>
+        </requirements>
+    </xml>
+    <xml name="version">
+        <version_command>@WRAPPER_VERSION@</version_command>
+    </xml>
+    <xml name="stdio">
+        <stdio>
+            <exit_code range="1:" level="fatal" />
+        </stdio>
+    </xml>
+</macros>
diff -r 000000000000 -r 4e4e4093d65d ngsutils/__init__.pyc
Binary file ngsutils/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bam/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/bam/__init__.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,879 @@
+import sys
+import os
+import re
+import pysam
+try:
+    from eta import ETA
+except:
+    pass
+import ngsutils.support
+
+
+def bam_open(fname, mode='r', *args, **kwargs):
+    if fname.lower()[-4:] == '.bam':
+        return pysam.Samfile(fname, '%sb' % mode, *args, **kwargs)
+    return pysam.Samfile(fname, '%s' % mode, *args, **kwargs)
+
+
+def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None):
+    if not quiet and bam.filename:
+        eta = ETA(os.stat(bam.filename).st_size)
+    else:
+        eta = None
+
+    for pileup in bam.pileup(mask=mask):
+        pos = bam.tell()
+        bgz_offset = pos >> 16
+
+        if not quiet:
+            if callback:
+                eta.print_status(bgz_offset, extra=callback(pileup))
+            else:
+                eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos))
+
+        yield pileup
+
+    if eta:
+        eta.done()
+
+
+def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None):
+    '''
+    >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
+    ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
+    '''
+
+    if os.path.exists('%s.bai' % bam.filename):
+        # This is an indexed file, so it is ref sorted...
+        # Meaning that we should show chrom:pos, instead of read names
+        show_ref_pos = True
+
+    eta = None
+
+    if not ref:
+        if not quiet and bam.filename:
+            eta = ETA(os.stat(bam.filename).st_size)
+
+        for read in bam:
+            pos = bam.tell()
+            bgz_offset = pos >> 16
+
+            if not quiet and eta:
+                if callback:
+                    eta.print_status(bgz_offset, extra=callback(read))
+                elif (show_ref_pos):
+                    if read.tid > -1:
+                        eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
+                    else:
+                        eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname))
+                else:
+                    eta.print_status(bgz_offset, extra='%s' % read.qname)
+
+            yield read
+
+    else:
+        working_chrom = None
+        if ref in bam.references:
+            working_chrom = ref
+        elif ref[0:3] == 'chr':
+            # compensate for Ensembl vs UCSC ref naming
+            if ref[3:] in bam.references:
+                working_chrom = ref[3:]
+
+        if not working_chrom:
+            raise ValueError('Missing reference: %s' % ref)
+
+        tid = bam.gettid(working_chrom)
+
+        if not start:
+            start = 0
+        if not end:
+            end = bam.lengths[tid]
+
+        if not quiet and bam.filename:
+            eta = ETA(end - start)
+
+        for read in bam.fetch(working_chrom, start, end):
+            if not quiet and eta:
+                if callback:
+                    eta.print_status(read.pos - start, extra=callback(read))
+                else:
+                    eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
+
+            yield read
+
+    if eta:
+        eta.done()
+
+
+def bam_batch_reads(bam, quiet=False):
+    '''
+    Batch mapping for the same reads (qname) together, this way
+    they can all be compared/converted together.
+    '''
+    reads = []
+    last = None
+    for read in bam_iter(bam, quiet=quiet):
+        if last and read.qname != last:
+            yield reads
+            reads = []
+        last = read.qname
+        reads.append(read)
+
+    if reads:
+        yield reads
+
+
+bam_cigar = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']
+bam_cigar_op = {
+    'M': 0,
+    'I': 1,
+    'D': 2,
+    'N': 3,
+    'S': 4,
+    'H': 5,
+    'P': 6,
+    '=': 7,
+    'X': 8,
+}
+
+
+def cigar_fromstr(s):
+    '''
+    >>> cigar_fromstr('10M5I20M')
+    [(0, 10), (1, 5), (0,
[...]
+def read_alignment_fragments_gen(read):
+    '''
+    Takes a read and returns the start and end positions for each match/mismatch region.
+    This will let us know where each read alignment "touches" the genome.
+    '''
+
+    for start, end in _read_alignment_fragments_gen(read.pos, read.cigar):
+        yield (start, end)
+
+
+def _read_alignment_fragments_gen(pos, cigar):
+    ''' Test-able version of read_alignment_fragments_gen
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('50M')))
+    [(1, 51)]
+
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('25M100N25M')))
+    [(1, 26), (126, 151)]
+
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('20M1D4M100N10M5I10M')))
+    [(1, 21), (22, 26), (126, 136), (136, 146)]
+    '''
+    ref_pos = pos
+
+    for op, length in cigar:
+        if op == 0:
+            yield (ref_pos, ref_pos + length)
+            ref_pos += length
+        elif op == 1:
+            pass
+        elif op == 2:
+            ref_pos += length
+        elif op == 3:
+            ref_pos += length
+        else:
+            raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+
+def read_cigar_at_pos(cigar, qpos, is_del):
+    '''
+    Returns the CIGAR operation for a given read position
+
+    qpos is the 0-based index of the base in the read
+    '''
+    pos = 0
+    returnnext = False
+    for op, length in cigar:
+        if returnnext:
+            return op
+        if op == 0:
+            pos += length
+        elif op == 1:
+            pos += length
+        elif op == 2:
+            pass
+        elif op == 3:
+            pass
+        elif op == 4:
+            pos += length
+        elif op == 5:
+            pass
+        else:
+            raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+        if pos > qpos:
+            return op
+        if is_del and pos == qpos:
+            returnnext = True
+
+    return None
+
+
+def cleancigar(cigar):
+    '''
+    Cleans a CIGAR alignment to remove zero length operations
+
+    >>> cigar_tostr(cleancigar(cigar_fromstr('31M0N52M18S')))
+    '83M18S'
+
+    '''
+    newcigar = []
+
+    changed = False
+    zero = False
+
+    for op, size in cigar:
+        if size > 0:
+            if zero and newcigar:
+                if newcigar[-1][0] == op:
+                    newsize = newcigar[-1][1] + size
+                    newcigar[-1] = (op, newsize)
+                zero = 0
+            else:
+                newcigar.append((op, size))
+        else:
+            changed = True
+            zero = True
+
+    if changed:
+        return newcigar
+
+    return None
+
+
+def read_cleancigar(read):
+    '''
+    Replaces the CIGAR string for a read to remove any operations that are zero length.
+
+    This happens to the read in-place
+    '''
+    if read.is_unmapped:
+        return False
+
+    newcigar = []
+
+    newcigar = cleancigar(read.cigar)
+
+    if newcigar:
+        read.cigar = newcigar
+        return True
+
+    return False
+
+
+def read_to_unmapped(read, ref=None):
+    '''
+    Converts a read from mapped to unmapped.
+
+    Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is passed)
+    '''
+
+    newread = pysam.AlignedRead()
+
+    if ref:
+        tags = [('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar)))]
+
+    newread.is_unmapped = True
+    newread.mapq = 0
+    newread.tlen = 0
+    newread.pos = -1
+    newread.pnext = -1
+    newread.rnext = -1
+    newread.tid = -1
+
+    newread.qname = read.qname
+
+    if read.is_paired:
+        newread.is_paired = True
+
+    if not read.is_unmapped and read.is_reverse:
+        newread.seq = ngsutils.support.revcomp(read.seq)
+        newread.qual = read.qual[::-1]
+    else:
+        newread.seq = read.seq
+        newread.qual = read.qual
+
+    newread.tags = tags
+
+    return newread
+
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
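For orientation, a short usage sketch of the iterators above, written in the module's own Python 2 style ('input.bam' is a hypothetical BAM whose reads are grouped by name):

    bam = bam_open('input.bam')  # picks binary vs. text mode from the extension
    for reads in bam_batch_reads(bam, quiet=True):
        # all mappings that share one qname arrive together
        print reads[0].qname, len(reads)
    bam.close()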
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bam/__init__.pyc
Binary file ngsutils/bam/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bed/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/bed/__init__.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,282 @@
+import os
+import ngsutils.support.ngs_utils
+import pysam
+
+
+class BedStreamer(object):
+    '''
+    Streams BedRegions from a BED file
+
+    Note - this can only be used once! There is no mechanism to seek the stream.
+    '''
+
+    def __init__(self, fname=None, fileobj=None, quiet=False):
+        if not fname and not fileobj:
+            raise ValueError("You must specify either fname or fileobj!")
+
+        self.reader = ngsutils.support.gzip_reader(fname=fname, quiet=quiet, fileobj=fileobj)
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        try:
+            while True:
+                line = self.reader.next().strip()
+                if line and line[0] != '#':
+                    cols = line.split('\t')
+                    while len(cols) < 6:
+                        cols.append('')
+
+                    return BedRegion(*cols)
+        except:
+            raise StopIteration
+
+
+
+class BedFile(object):
+    '''
+    BED files are read in their entirety into memory, in a series of bins. Each bin
+    is ~100kb in size. Each bin can then be iterated over.
+
+    This is less efficient than using a proper index, but in reality, this
+    usually isn't an issue. However, if the BED file has been Tabix indexed,
+    that index will be used for random access.
+
+    NOTE: This isn't very efficient, so perhaps this can be remodeled into a BedFile
+    and a BedFileIndex where the file is indexed only if random access is requested.
+    '''
+
+    _bin_const = 100000
+
+    def __init__(self, fname=None, fileobj=None, region=None):
+        self._bins = {}
+        self._bin_list = []
+        self._cur_bin_idx = 0
+        self._cur_bin_pos = 0
+        self._tellpos = 0
+        self._total = 0
+        self._length = 0
+        self.__tabix = None
+
+        self.filename = fname
+
+        if os.path.exists('%s.tbi' % fname):
+            self.__tabix = pysam.Tabixfile(fname)
+
+        if fileobj:
+            self.__readfile(fileobj)
+        elif fname:
+            with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj:
+                self.__readfile(fobj)
+        elif region:
+            chrom, startend = region.split(':')
+            if '-' in startend:
+                start, end = [int(x) for x in startend.split('-')]
+            else:
+                start = int(startend)
+                end = start
+            start -= 1
+
+            self.__add_region(BedRegion(chrom, start, end))
+        else:
+            raise ValueError("Must specify either filename, fileobj, or region")
+
+    def __readfile(self, fobj):
+        for line in fobj:
+            line = line.strip()
+            if line and line[0] != '#':
+                cols = line.split('\t')
+                while len(cols) < 6:
+                    cols.append('')
+
+                region = BedRegion(*cols)
+                self.__add_region(region)
+
+        self._bin_list.sort()
+        for bin in self._bins:
+            self._bins[bin].sort()
+
+    def __add_region(self, region):
+        self._total += region.end - region.start
+        self._length += 1
+
+        startbin = region.start / BedFile._bin_const
+        endbin = region.end / BedFile._bin_const
+
+        for bin in xrange(startbin, endbin + 1):
+            if not (region.chrom, bin) in self._bins:
+                self._bin_list.append((region.chrom, bin))
+                self._bins[(region.chrom, bin)] = []
+            self._bins[(region.chrom, bin)].append(region)
+
+    def fetch(self, chrom, start, end, strand=None):
+        '''
+        For TABIX indexed BED files, find all regions w/in a range
+
+        For non-TABIX index BED files, use the calculated bins, and
+        output matching regions
+        '''
+
+        if self.__tabix:
+            for match in self.__tabix.fetch(chrom, start, end):
+                region = BedRegion(*match.split('\t'))
+                if not strand or (strand
[...]
+    def close(self):
+        pass
+
+    @property
+    def length(self):
+        return self._length
+
+    @property
+    def total(self):
+        return self._total
+
+    def __iter__(self):
+        self._cur_bin_idx = 0
+        self._cur_bin_pos = 0
+        self._tellpos = 0
+        return self
+
+    def next(self):
+        if self._cur_bin_idx >= len(self._bin_list):
+            raise StopIteration
+
+        binvals = self._bins[self._bin_list[self._cur_bin_idx]]
+        while self._cur_bin_pos < len(binvals):
+            val = binvals[self._cur_bin_pos]
+            self._cur_bin_pos += 1
+
+            startbin = (val.chrom, val.start / BedFile._bin_const)
+            if startbin == self._bin_list[self._cur_bin_idx]:
+                self._tellpos += 1
+                return val
+
+        self._cur_bin_idx += 1
+        self._cur_bin_pos = 0
+        return self.next()
+
+
+class BedRegion(object):
+    def __init__(self, chrom, start, end, name='', score='', strand='', thickStart='', thickEnd='', rgb='', *args):
+        self.chrom = chrom
+        self.start = int(start)
+        self.end = int(end)
+        self.name = name
+
+        if score == '':
+            self.score = 0
+        else:
+            self.score = float(score)
+
+        if strand == '':
+            self.strand = None
+        else:
+            self.strand = strand
+
+        if thickStart == '':
+            self.thickStart = None
+        else:
+            self.thickStart = thickStart
+
+        if thickEnd == '':
+            self.thickEnd = None
+        else:
+            self.thickEnd = thickEnd
+
+        if rgb == '':
+            self.rgb = None
+        else:
+            self.rgb = rgb
+
+        self.extras = args
+
+    def clone(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, rgb=None, *args):
+        cols = []
+        cols.append(self.chrom if chrom is None else chrom)
+        cols.append(self.start if start is None else start)
+        cols.append(self.end if end is None else end)
+        cols.append(self.name if name is None else name)
+        cols.append(self.score if score is None else score)
+        cols.append(self.strand if strand is None else strand)
+        cols.append(self.thickStart if thickStart is None else thickStart)
+        cols.append(self.thickEnd if thickEnd is None else thickEnd)
+        cols.append(self.rgb if rgb is None else rgb)
+
+        for i, val in enumerate(self.extras):
+            if len(args) > i:
+                cols.append(args[i])
+            else:
+                cols.append(val)
+
+        return BedRegion(*cols)
+
+    @property
+    def score_int(self):
+        score = str(self.score)
+        if score[-2:] == '.0':
+            score = score[:-2]
+
+        return score
+
+    def __key(self):
+        return (self.chrom, self.start, self.end, self.strand, self.name)
+
+    def __lt__(self, other):
+        return self.__key() < other.__key()
+
+    def __gt__(self, other):
+        return self.__key() > other.__key()
+
+    def __eq__(self, other):
+        return self.__key() == other.__key()
+
+    def write(self, out):
+        out.write('%s\n' % self)
+
+    def __repr__(self):
+        outcols = []
+
+        if self.rgb:
+            outcols.append(self.rgb)
+        if self.thickEnd or outcols:
+            outcols.append(self.thickEnd if self.thickEnd else self.end)
+        if self.thickStart or outcols:
+            outcols.append(self.thickStart if self.thickStart else self.start)
+        if self.strand or outcols:
+            outcols.append(self.strand)
+        if self.score_int != '' or outcols:
+            outcols.append(self.score_int)
+        if self.name or outcols:
+            outcols.append(self.name)
+
+        outcols.append(self.end)
+        outcols.append(self.start)
+        outcols.append(self.chrom)
+
+        return '\t'.join([str(x) for x in outcols[::-1]])
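BedFile trades memory for simplicity: each region is registered in every ~100kb bin its span touches, so fetch() only has to scan candidate bins (or it defers to a .tbi index when one exists next to the file). A usage sketch, assuming a hypothetical 'regions.bed':

    bed = BedFile('regions.bed')
    for region in bed.fetch('chr1', 150000, 160000):
        print region.chrom, region.start, region.end, region.strand
    print bed.length, bed.total  # number of regions, total bases covered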
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bed/__init__.pyc
Binary file ngsutils/bed/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/__init__.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,249 @@
+import collections
+import gzip
+import os
+import sys
+import re
+try:
+    from eta import ETA
+except:
+    pass
+
+class FASTARead(collections.namedtuple('FASTARecord', 'name comment seq')):
+    def __repr__(self):
+        if self.comment:
+            return '>%s %s\n%s\n' % (self.name, self.comment, self.seq)
+        return '>%s\n%s\n' % (self.name, self.seq)
+
+    def subseq(self, start, end, comment=None):
+        if self.comment:
+            comment = '%s %s' % (self.comment, comment)
+
+        return FASTARead(self.name, comment, self.seq[start:end])
+
+    def clone(self, name=None, comment=None, seq=None):
+        n = name if name else self.name
+        c = comment if comment else self.comment
+        s = seq if seq else self.seq
+
+        return FASTARead(n, c, s)
+
+    def write(self, out):
+        out.write(repr(self))
+
+
+class FASTA(object):
+    def __init__(self, fname=None, fileobj=None, qual=False):
+        self.fname = fname
+        self.qual = qual
+        if fileobj:
+            self.fileobj = fileobj
+        else:
+            if self.fname == '-':
+                self.fileobj = sys.stdin
+            elif self.fname[-3:] == '.gz' or self.fname[-4:] == '.bgz':
+                self.fileobj = gzip.open(os.path.expanduser(self.fname))
+            else:
+                self.fileobj = open(os.path.expanduser(self.fname))
+
+        if not self.fileobj:
+            raise ValueError("Missing valid filename or fileobj")
+
+    def close(self):
+        if self.fileobj != sys.stdout:
+            self.fileobj.close()
+
+    def tell(self):
+        # always relative to uncompressed...
+        return self.fileobj.tell()
+
+    def seek(self, pos, whence=0):
+        self.fileobj.seek(pos, whence)
+
+    def fetch(self, quiet=False):
+        name = ''
+        comment = ''
+        seq = ''
+
+        if not quiet and self.fname and self.fname != '-':
+            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
+        else:
+            eta = None
+
+        for line in self.fileobj:
+            line = line.strip()
+            if not line:
+                continue
+            if line[0] == '#':
+                continue
+
+            if line[0] == '>':
+                if name and seq:
+                    if eta:
+                        eta.print_status(extra=name)
+                    yield FASTARead(name, comment, seq)
+
+                spl = re.split(r'[ \t]', line[1:], maxsplit=1)
+                name = spl[0]
+                if len(spl) > 1:
+                    comment = spl[1]
+                else:
+                    comment = ''
+                seq = ''
+
+            else:
+                if self.qual:
+                    seq = seq + ' ' + line
+                else:
+                    seq += line
+
+        if name and seq:
+            if eta:
+                eta.print_status(extra=name)
+            yield FASTARead(name, comment, seq)
+
+        if eta:
+            eta.done()
+
+
+def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None):
+    if fileobj:
+        f = fileobj
+    elif fname == '-':
+        f = sys.stdin
+    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+        f = gzip.open(os.path.expanduser(fname))
+    else:
+        f = open(os.path.expanduser(fname))
+
+    if quiet or fname == '-':
+        eta = None
+    else:
+        eta = ETA(os.stat(fname).st_size, fileobj=f)
+
+    for line in f:
+        if eta:
+            if callback:
+                extra = callback()
+            else:
+                extra = ''
+
+            eta.print_status(extra=extra)
+        yield line
+
+        if done_callback and done_callback():
+                break
+
+    if f != sys.stdin:
+        f.close()
+
+    if eta:
+        eta.done()
+
+
+class Symbolize(object):
+    'Converts strings to symbols - basically a cache of strings'
+    def __init__(self):
+        self.__cache = {}
+
+    def __getitem__(self, k):
+        if not k in self.__cache:
+            self.__cache[k] = k
+
+        return self.__cache[k]
+
+symbols = Symbolize()
+
+_compliments = {
+'a': 't',
+'A': 'T',
+'c': 'g',
+'C': 'G',
+'g': 'c',
+'G': 'C',
+'t': 'a',
+'T': 'A',
+'n': 'n',
+'N': 'N'
+}
+
+
+def revcomp(seq):
+    '''
+    >>> revcomp('ATCGatcg')
+    'cgatCGAT'
+    '''
+    ret = []
+
+    for s in seq:
+        ret.append(_compliments[s])
+
+    ret.reverse()
+    return ''.join(ret)
+
+
+class Counts(object):
+    '''
+    Setup simple binning.  Bins are continuous 0->max.  Values are added to
+    bins and then means / distributions can be calculated.
+    '''
+    def __init__(self):
+        self.bins = []
+
+    def add(self, val):
+        while len(self.bins) <= val:
+            self.bins.append(0)
+        self.bins[val] += 1
+
+    def mean(self):
+        acc = 0
+        count = 0
+
+        for i, val in enumerate(self.bins):
+            acc += (i * val)
+            count += val
+
+        if count > 0:
+            return float(acc) / count
+
+    def max(self):
+        return len(self.bins) - 1
+
+
+def memoize(func):
+    if 'TESTING' in os.environ or 'DEBUG' in os.environ:
+        return func
+
+    __cache = {}
+    def inner(*args, **kwargs):
+        k = (args, tuple(kwargs.iteritems()))
+        if  k not in __cache:
+            __cache[k] = func(*args, **kwargs)
+        return __cache[k]
+
+    inner.__doc__ = '(@memoized %s)\n%s' % (func.__name__, func.__doc__)
+    return inner
+
+
+def quoted_split(s, delim, quote_char='"'):
+    tokens = []
+
+    buf = ""
+    inquote = False
+
+    for c in s:
+        if inquote:
+            buf += c
+            if c == quote_char:
+                inquote = False
+        elif c == delim:
+            tokens.append(buf)
+            buf = ""
+        else:
+            buf += c
+            if c == quote_char:
+                inquote = True
+
+    if buf:
+        tokens.append(buf)
+
+    return tokens
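Two helpers above are easy to misread, so here are quick checks (outputs traced by hand from the code, not taken from the module's own doctests):

    # quoted_split() keeps delimiters that fall inside quotes and retains
    # the quote characters themselves:
    print quoted_split('a,"b,c",d', ',')  # ['a', '"b,c"', 'd']

    # Counts bins non-negative integer samples; max() is the largest
    # value ever added:
    c = Counts()
    for val in [2, 2, 3, 5]:
        c.add(val)
    print c.mean(), c.max()  # 3.0 5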
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/__init__.pyc
Binary file ngsutils/support/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/bgzip.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/bgzip.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+'''
+Extract the blocks from a BGZip file.
+
+BAM files are stored as blocks in a bgzip archive. This class
+will load the bgzip archive and output the block information.
+'''
+
+import sys
+import os
+import struct
+
+
+class BGZip(object):
+    def __init__(self, fname):
+        self.fname = fname
+        self.pos = 0
+        self.fileobj = open(self.fname)
+        self.fsize = os.stat(self.fname).st_size
+        self.cur_chunk = 0
+
+        self.cpos = 0
+        self.cdata = 0
+
+    def close(self):
+        self.fileobj.close()
+
+    def seek(self, offset):
+        bgz_offset = offset >> 16
+        chunk_offset = offset & 0xFFFF
+
+        self.fileobj.seek(bgz_offset)
+        self.read_chunk()
+        self.chunk_pos = chunk_offset
+
+    def read(self, amount, whence=0):
+        if whence not in [0, 1]:
+            print "Bad Whence value!: %s" % whence
+            return
+
+        if whence == 0:
+            self.seek(0, 0)
+
+        ### read into chunk, if not enough data in chunk, read next chunk
+        ret = ''
+        while amount and self.pos < self.fsize:
+            if len(self.cdata) - self.cpos < amount:
+                ret += self.cdata[self.cpos:self.cpos + amount]
+                self.cpos += amount
+                return ret
+            else:
+                ret += self.cdata[self.cpos:]
+                amount = amount - len(ret)
+                self.read_chunk()
+        return ret
+
+    def read_chunk(self):
+        self.fileobj.seek(10)
+        id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
+        subpos = 0
+        bsize = 0
+
+        while subpos < xlen:
+            si1, si2, slen = self._read_fields('<BBH')
+            if si1 == 66 and si2 == 67:
+                bsize, = self._read_fields('<H')
+            else:
+                self.fileobj.seek(slen, 1)
+                self.pos += slen
+
+            subpos += 6 + slen
+
+        cdata_size = bsize - xlen - 19
+        self.cdata = self.fileobj.read(cdata_size)  # inflate value
+        self.fileobj.seek(8)
+
+        self.cur_chunk += 1
+        self.cpos = 0
+
+    def dump(self):
+        self.fileobj.seek(0)
+        block_num = 0
+
+        while self.pos < self.fsize:
+            print "[BLOCK %s]" % block_num
+            # read header
+            id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
+            print 'id1: %s' % id1
+            print 'id2: %s' % id2
+            print 'cm: %s' % cm
+            print 'flg: %s' % flg
+            print 'mtime: %s' % mtime
+            print 'xfl: %s' % xfl
+            print 'os: %s' % os
+            print 'xlen: %s' % xlen
+
+            # read subfields
+            subpos = 0
+            bsize = 0
+
+            while subpos < xlen:
+                si1, si2, slen = self._read_fields('<BBH')
+                print '    si1: %s' % si1
+                print '    si2: %s' % si2
+                print '    slen: %s' % slen
+                print '    data: [%s]' % slen
+
+                if si1 == 66 and si2 == 67:
+                    bsize, = self._read_fields('<H')
+                else:
+                    self.fileobj.seek(slen, 1)
+                    self.pos += slen
+
+                subpos += 6 + slen
+
+            cdata_size = bsize - xlen - 19
+
+            print 'bsize: %s' % bsize
+            print 'cdata: [%s]' % (cdata_size)
+
+            self.fileobj.seek(cdata_size, 1)
+            self.pos += cdata_size
+            crc, isize = self._read_fields('<II')
+
+            print "crc: %s" % crc
+            print "isize: %s" % isize
+            # print "POS: %s" % self.pos
+
+            block_num += 1
+
+    def _read_fields(self, field_types):
+        size = struct.calcsize(field_types)
+        self.pos += size
+        return struct.unpack(field_types, self.fileobj.read(size))
+
+if __name__ == '__main__':
+    print BGZip(sys.argv[1]).dump()
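The seek() method above unpacks a BGZF "virtual offset", the same convention the bam/__init__.py iterators rely on when they shift bam.tell() right by 16 bits. In short (some_bam is assumed to be an open pysam BAM handle):

    virtual = some_bam.tell()
    file_offset = virtual >> 16      # where the compressed block starts on disk
    within_block = virtual & 0xFFFF  # byte offset inside the inflated block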
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/dbsnp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/dbsnp.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,141 @@
+'''
+Support package for processing a dbSNP tabix dump from UCSC.
+'''
+
+import pysam
+import collections
+import sys
+from ngsutils.support import revcomp
+
+
+class SNPRecord(collections.namedtuple('SNPRecord', '''bin
+chrom
+chromStart
+chromEnd
+name
+score
+strand
+refNCBI
+refUCSC
+observed
+molType
+clazz
+valid
+avHet
+avHetSE
+func
+locType
+weight
+exceptions
+submitterCount
+submitters
+alleleFreqCount
+alleles
+alleleNs
+alleleFreqs
+bitfields
+''')):
+    __slots__ = ()
+
+    @property
+    def alleles(self):
+        alts = []
+        for alt in self.observed.split('/'):
+            if alt != '-' and self.strand == '-':
+                alt = revcomp(alt)
+
+            alts.append(alt)
+
+        return alts
+
+    @property
+    def snp_length(self):
+        return self.chromEnd - self.chromStart
+
+
+def autotype(ar, length=26):
+    out = []
+    for el in ar:
+        val = None
+        try:
+            val = int(el)
+        except:
+            try:
+                val = float(el)
+            except:
+                val = el
+
+        out.append(val)
+
+    if len(out) < 12:
+        raise TypeError("Invalid dbSNP file! We need at least 12 columns to work with.")
+
+    while len(out) < length:
+        out.append(None)
+    return out
+
+
+class DBSNP(object):
+    def __init__(self, fname):
+        self.dbsnp = pysam.Tabixfile(fname)
+        self.asTup = pysam.asTuple()
+
+    def fetch(self, chrom, pos):
+        'Note: pos is 0-based'
+
+        # Note: tabix the command uses 1-based positions, but
+        #       pysam.Tabixfile uses 0-based positions
+
+        for tup in self.dbsnp.fetch(chrom, pos, pos + 1, parser=self.asTup):
+            snp = SNPRecord._make(autotype(tup))
+            if snp.chromStart == pos:
+                yield snp
+
+    def close(self):
+        self.dbsnp.close()
+
+    def dump(self, chrom, op, pos, base, snp, exit=True):
+        print
+        print ' ->', op, chrom, pos, base
+        print snp
+        print snp.alleles
+        if exit:
+            sys.exit(1)
+
+    def is_valid_variation(self, chrom, op, pos, seq, verbose=False):
+        for snp in self.fetch(chrom, pos):
+            if not '/' in snp.observed or snp.clazz not in ['single', 'mixed', 'in-del', 'insertion', 'deletion']:
+                # these are odd variations that we can't deal with... (microsatellites, tooLongToDisplay members, etc)
+                continue
+
+            if op == 0:
+                if snp.clazz in ['single', 'mixed'] and seq in snp.alleles:
+                    return True
+                # elif verbose:
+                #     for alt in snp.alleles:
+                #         if len(alt) > 1:
+                #             self.dump(chrom, op, pos, seq, snp, False)
+
+            elif op == 1:
+                if snp.clazz in ['insertion', 'mixed', 'in-del']:
+                    if seq in snp.alleles:
+                        if len(seq) > 1 and verbose:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        return True
+
+                    if verbose:
+                        if len(seq) > 1:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        else:
+                            for alt in snp.alleles:
+                                if len(alt) > 1:
+                                    self.dump(chrom, op, pos, seq, snp, False)
+
+            elif op == 2:
+                if snp.clazz in ['deletion', 'mixed', 'in-del']:
+                    if '-' in snp.alleles and seq in snp.alleles:
+                        if len(seq) > 1 and verbose:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        return True
+
+        return False
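A usage sketch for the class above; 'dbsnp.txt.bgz' stands in for a Tabix-indexed UCSC dbSNP dump, and the position/allele are made up for illustration. The op codes follow CIGAR: 0 match/mismatch, 1 insertion, 2 deletion:

    db = DBSNP('dbsnp.txt.bgz')
    # pos is 0-based, matching DBSNP.fetch()
    if db.is_valid_variation('chr1', 0, 10176, 'C'):
        print 'known SNP - not counted as a mismatch by -mismatch_dbsnp'
    db.close()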
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/dbsnp.pyc
Binary file ngsutils/support/dbsnp.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/llh.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/llh.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,55 @@
+'''
+Methods for calculating log-likelihoods for nucleotide frequencies
+'''
+import math
+import collections
+from ngsutils.support import memoize
+
+_default_background = {'A': 0.3, 'T': 0.3, 'C': 0.2, 'G': 0.2}
+
+NucleotideLogLikelihood = collections.namedtuple('NucleotideLogLikelihood', 'A C G T pseudo')
+
+@memoize
+def pseudo_count(N, bg):
+    '''
+    >>> pseudo_count(100, _default_background['A'])
+    3
+    >>> pseudo_count(100, _default_background['C'])
+    2
+    '''
+
+    return bg * math.sqrt(N)
+
+
+def calc_llh(A, C, G, T, bg=_default_background, pseudo='auto'):
+    if pseudo == 'auto':
+        N = A + C + G + T
+        Ap = float(A) + pseudo_count(N, bg['A'])
+        Cp = float(C) + pseudo_count(N, bg['C'])
+        Gp = float(G) + pseudo_count(N, bg['G'])
+        Tp = float(T) + pseudo_count(N, bg['T'])
+    elif pseudo:
+        Ap = float(A) + pseudo
+        Cp = float(C) + pseudo
+        Gp = float(G) + pseudo
+        Tp = float(T) + pseudo
+    else:
+        Ap = float(A)
+        Cp = float(C)
+        Gp = float(G)
+        Tp = float(T)
+
+    Np = Ap + Cp + Gp + Tp
+
+    freqA = Ap / Np
+    freqC = Cp / Np
+    freqG = Gp / Np
+    freqT = Tp / Np
+
+    return NucleotideLogLikelihood(math.log(freqA / bg['A']), math.log(freqC / bg['C']), math.log(freqG / bg['G']), math.log(freqT / bg['T']), pseudo)
+
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
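A quick sanity check for calc_llh(): with pseudo='auto', each count is smoothed by bg * sqrt(N) before the log(frequency / background) ratios are taken, so enrichment relative to background yields a positive score:

    llh = calc_llh(70, 10, 10, 10)
    print llh.A > 0  # True: A is enriched vs. its 0.3 background
    print llh.C < 0  # True: C is depleted vs. its 0.2 background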
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/ngs_utils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/ngs_utils.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+"""
+
+Common util classes / functions for the NGS project
+
+"""
+import sys
+import os
+import gzip
+import re
+import collections
+
+
+def format_number(n):
+    '''
+    >>> format_number(1000)
+    '1,000'
+    >>> format_number(1234567)
+    '1,234,567'
+    '''
+    ar = list(str(n))
+    for i in range(len(ar))[::-3][1:]:
+        ar.insert(i + 1, ',')
+    return ''.join(ar)
+
+
+def natural_sort(ar):
+    '''
+    >>> natural_sort('1 3 4 2 5'.split())
+    ['1', '2', '3', '4', '5']
+    >>> natural_sort('1 10 20 2 3 4'.split())
+    ['1', '2', '3', '4', '10', '20']
+    '''
+    to_sort = []
+    for item in ar:
+        spl = re.split('(\d+)', item)
+        l2 = []
+        for el in spl:
+            try:
+                n = int(el)
+            except:
+                n = el
+            l2.append(n)
+        to_sort.append((l2, item))
+
+    to_sort.sort()
+    return [x[1] for x in to_sort]
+
+
+def dictify(values, colnames):
+    """
+    Convert a list of values into a dictionary based upon given column names.
+
+    If the column name starts with an '@', the value is assumed to be a comma
+    separated list.
+
+    If the name starts with a '#', the value is assumed to be an int.
+
+    If the name starts with '@#', the value is assumed to be a comma separated
+    list of ints.
+
+    """
+    d = {}
+    for i in xrange(len(colnames)):
+        key = colnames[i]
+        split = False
+        num = False
+
+        if key[0] == '@':
+            key = key[1:]
+            split = True
+        if key[0] == '#':
+            key = key[1:]
+            num = True
+
+        if i < len(values):
+            if num and split:
+                val = [int(x) for x in values[i].rstrip(',').split(',')]
+            elif num:
+                val = int(values[i])
+            elif split:
+                val = values[i].rstrip(',').split(',')
+            else:
+                val = values[i]
+
+            d[key] = val
+
+        else:
+            d[key] = None
+
+    return d
+
+
+def gzip_aware_open(fname):
+    if fname == '-':
+        f = sys.stdin
+    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+        f = gzip.open(os.path.expanduser(fname))
+    else:
+        f = open(os.path.expanduser(fname))
+    return f
+
+
+class gzip_opener:
+    '''
+    A Python 2.6 class to handle 'with' opening of text files that may
+    or may not be gzip compressed.
+    '''
+    def __init__(self, fname):
+        self.fname = fname
+
+    def __enter__(self):
+        self.f = gzip_aware_open(self.fname)
+        return self.f
+
+    def __exit__(self, type, value, traceback):
+        if self.f != sys.stdin:
+            self.f.close()
+        return False
+
+
+def filenames_to_uniq(names, new_delim='.'):
+    '''
+    Given a set of file names, produce a list of names consisting of the
+    uniq parts of the names. This works from the end of the name.  Chunks of
+    the name are split on '.' and '-'.
+
+    For example:
+        A.foo.bar.txt
+        B.foo.bar.txt
+        returns: ['A','B']
+
+        AA.BB.foo.txt
+        CC.foo.txt
+        returns: ['AA.BB','CC']
+
+    >>> filenames_to_uniq('a.foo.bar.txt b.foo.bar.txt'.split())
+    ['a', 'b']
+    >>> filenames_to_uniq('a.b.foo.txt c.foo.txt'.split())
+    ['a.b', 'c']
+
+    '''
+    name_words = []
+    maxlen = 0
+    for name in names:
+        name_words.append(name.replace('.', ' ').replace('-', ' ').strip().split())
+        name_words[-1].reverse()
+        if len(name_words[-1]) > maxlen:
+            maxlen = len(name_words[-1])
+
+    common = [False, ] * maxlen
+    for i in xrange(maxlen):
+        last = None
+        same = True
+        for nameword in name_words:
+            if i >= len(nameword):
+                same = False
+                break
+            if not last:
+                last = nameword[i]
+            elif nameword[i] != last:
+                same = False
+                break
+        common[i] = same
+
+    newnames = []
+    for nameword in name_words:
+        nn = []
+        for (i, val) in enumerate(common):
+            if not val and i < len(nameword):
+                nn.append(nameword[i])
+        nn.reverse()
+        newnames.append(new_delim.join(nn))
+    return newnames
+
+
+def parse_args(argv, defaults=None, expected_argc=0):
+    opts = {}
+    if defaults:
+        opts.update(defaults)
+
+    args = []
+
+    i = 0
+    while i < len(argv):
+        if argv[i][0] == '-':
+            arg = argv[i].lstrip('-')
+            if '=' in arg:
+                k, v = arg.split('=', 2)
+                if k in defaults:
+                    if type(defaults[k]) == float:
+                        opts[k] = float(v)
+                    elif type(defaults[k]) == int:
+                        opts[k] = int(v)
+                    else:
+                        opts[k] = v
+            else:
+                opts[arg] = True
+        else:
+            args.append(argv[i])
+        i += 1
+
+    while len(args) < expected_argc:
+        args.append(None)
+    return opts, args
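+
+# Usage sketch (argument values are illustrative, not from the original
+# source):
+#
+#     opts, args = parse_args(['-v', '-cutoff=0.5', 'in.bam'],
+#                             defaults={'cutoff': 1.0}, expected_argc=2)
+#     # opts -> {'cutoff': 0.5, 'v': True}; args -> ['in.bam', None]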
+
+
+class memoize(object):
+    'Simple memoizing decorator to cache results'
+    def __init__(self, func):
+        self.func = func
+        self.cache = {}
+
+    def __call__(self, *args):
+        if not isinstance(args, collections.Hashable):
+            # uncacheable. a list, for instance.
+            # better to not cache than blow up.
+            return self.func(*args)
+
+        if args in self.cache:
+            return self.cache[args]
+        else:
+            value = self.func(*args)
+            self.cache[args] = value
+            return value
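+
+# Usage sketch (hypothetical function):
+#
+#     @memoize
+#     def slow_lookup(key):
+#         ...  # expensive computation
+#
+# Repeated calls with the same hashable arguments return the cached result;
+# unhashable arguments fall through to a direct, uncached call.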
b
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/ngs_utils.pyc
b
Binary file ngsutils/support/ngs_utils.pyc has changed
b
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/regions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/regions.py Wed Nov 11 13:04:07 2015 -0500
[
@@ -0,0 +1,166 @@
+class RangeMatch(object):
+    '''
+    Simple genomic ranges.  You can define chrom:start-end ranges, then ask if a
+    particular genomic coordinate maps to any of those ranges.  This is less
+    efficient than an R-Tree, but easier to code.
+    '''
+    def __init__(self, name):
+        self.ranges = {}
+        self.name = name
+
+    def add_range(self, chrom, strand, start, end):
+        if chrom not in self.ranges:
+            self.ranges[chrom] = {}
+
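+        # Ranges are indexed into 100 kb bins so that get_tag only has to
+        # scan ranges near the query position.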
+        bin = start / 100000
+        if bin not in self.ranges[chrom]:
+            self.ranges[chrom][bin] = []
+        self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+        if (end / 100000) != bin:
+            for bin in xrange(bin + 1, (end / 100000) + 1):
+                if bin not in self.ranges[chrom]:
+                    self.ranges[chrom][bin] = []
+                self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+    def get_tag(self, chrom, strand, pos, ignore_strand=False):
+        '''
+        returns (region, is_reverse_orientation)
+        '''
+        if chrom not in self.ranges:
+            return None, False
+        bin = pos / 100000
+        if bin not in self.ranges[chrom]:
+            return None, False
+        for start, end, r_strand in self.ranges[chrom][bin]:
+            if pos >= start and pos <= end:
+                if ignore_strand or strand == r_strand:
+                    return self.name, False
+                return self.name, True
+        return None, False
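+
+# Usage sketch (chromosome and coordinates are illustrative):
+#
+#     utr = RangeMatch('utr')
+#     utr.add_range('chr1', '+', 1000, 2000)
+#     utr.get_tag('chr1', '+', 1500)    # -> ('utr', False)
+#     utr.get_tag('chr1', '-', 1500)    # -> ('utr', True)  (reverse strand)
+#     utr.get_tag('chr1', '+', 5000)    # -> (None, False)  (no match)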
+
+
+class RegionTagger(object):
+    def __init__(self, gtf, valid_chroms=None, only_first_fragment=True):
+        self.regions = []
+        self.counts = {}
+        self.only_first_fragment = only_first_fragment
+
+        coding = RangeMatch('coding')
+        exons = RangeMatch('other-exon')
+        utr_5 = RangeMatch('utr-5')
+        utr_3 = RangeMatch('utr-3')
+        introns = RangeMatch('intron')
+        promoters = RangeMatch('promoter')
+
+        for gene in gtf.genes:
+            if valid_chroms and gene.chrom not in valid_chroms:
+                continue
+            if gene.strand == '+':
+                promoters.add_range(gene.chrom, gene.strand, gene.start - 2000, gene.start)
+            else:
+                promoters.add_range(gene.chrom, gene.strand, gene.end, gene.end + 2000)
+
+            for transcript in gene.transcripts:
+                if transcript.has_cds:
+                    for start, end in transcript.cds:
+                        coding.add_range(gene.chrom, gene.strand, start, end)
+
+                    # TODO: Fix this so that it iterates over exons in the 5'/3' UTRS
+                    for s, e in transcript.utr_5:
+                        utr_5.add_range(gene.chrom, gene.strand, s, e)
+                    for s, e in transcript.utr_3:
+                        utr_3.add_range(gene.chrom, gene.strand, s, e)
+
+                last_end = None
+                for start, end in transcript.exons:
+                    if last_end:
+                        introns.add_range(gene.chrom, gene.strand, last_end, start)
+                    exons.add_range(gene.chrom, gene.strand, start, end)
+                    last_end = end
+
+        self.regions.append(coding)
+        self.regions.append(utr_5)
+        self.regions.append(utr_3)
+        self.regions.append(exons)
+        self.regions.append(introns)
+        self.regions.append(promoters)
+
+        self.counts['coding'] = 0
+        self.counts['coding-rev'] = 0
+        self.counts['other-exon'] = 0
+        self.counts['utr-5'] = 0
+        self.counts['utr-3'] = 0
+        self.counts['utr-5-rev'] = 0
+        self.counts['utr-3-rev'] = 0
+        self.counts['intron'] = 0
+        self.counts['promoter'] = 0
+        self.counts['other-exon-rev'] = 0
+        self.counts['intron-rev'] = 0
+        self.counts['promoter-rev'] = 0
+        self.counts['junction'] = 0
+        self.counts['intergenic'] = 0
+        self.counts['mitochondrial'] = 0
+
+    def add_read(self, read, chrom):
+        if read.is_unmapped:
+            return
+
+        if self.only_first_fragment and read.is_paired and not read.is_read1:
+            return
+
+        tag = None
+        is_rev = False
+
+        strand = '-' if read.is_reverse else '+'
+
+        if chrom == 'chrM':
+            tag = 'mitochondrial'
+
+        if not tag:
+            for op, length in read.cigar:
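+                # CIGAR op 3 ('N') is a reference skip, i.e. a splice junction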
+                if op == 3:
+                    tag = 'junction'
+                    break
+
+        if not tag:
+            for region in self.regions:
+                tag, is_rev = region.get_tag(chrom, strand, read.pos)
+                if tag:
+                    break
+
+        if not tag:
+            tag = 'intergenic'
+
+        if tag:
+            if is_rev:
+                self.counts['%s-rev' % tag] += 1
+            else:
+                self.counts[tag] += 1
+
+        return tag
+
+    def tag_region(self, chrom, start, end, strand):
+        tag = None
+        is_rev = False
+
+        if chrom == 'chrM' or chrom == 'M':
+            tag = 'mitochondrial'
+
+        if not tag:
+            for region in self.regions:
+                endtag = None  # stays None when start == end
+                tag, is_rev = region.get_tag(chrom, strand, start)
+                if is_rev:
+                    tag = '%s-rev' % tag
+
+                if start != end:
+                    endtag, is_rev = region.get_tag(chrom, strand, end)
+                    if is_rev:
+                        endtag = '%s-rev' % endtag
+
+                if tag and endtag and endtag != tag:
+                    tag = '%s/%s' % (tag, endtag)
+
+                if tag:
+                    break
+
+        if not tag:
+            tag = 'intergenic'
+
+        return tag
\ No newline at end of file
b
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/stats.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/stats.py Wed Nov 11 13:04:07 2015 -0500
[
@@ -0,0 +1,161 @@
+'''
+various statistical tests and methods...
+'''
+import math
+from ngsutils.support import memoize
+
+def median(vals):
+    '''
+    >>> median([1,2,3])
+    2
+    >>> median([1,2,3,4])
+    2.5
+    '''
+    vals = sorted(vals)
+
+    if len(vals) % 2 == 1:
+        return vals[len(vals) / 2]
+    else:
+        a = vals[(len(vals) / 2) - 1]
+        b = vals[(len(vals) / 2)]
+        return float(a + b) / 2
+
+
+def mean_stdev(l):
+    '''
+    >>> mean_stdev([1,2,2,2])
+    (1.75, 0.5)
+    >>> mean_stdev([2,2,2,2])
+    (2.0, 0.0)
+    '''
+
+    acc = 0
+    for el in l:
+        acc += el
+
+    mean = float(acc) / len(l)
+    acc = 0
+    for el in l:
+        acc += (el - mean) ** 2
+
+    if len(l) > 1:
+        stdev = math.sqrt(float(acc) / (len(l) - 1))
+    else:
+        stdev = 0.0
+
+    return (mean, stdev)
+
+
+def counts_median(d):
+    '''
+    Calculate the median from counts stored in a dictionary
+    >>> counts_median({ 1: 4, 2: 1, 3: 4 })
+    2
+    >>> counts_median({ 1: 4, 3: 4 })
+    2.0
+
+    '''
+    count = 0
+    for k in d:
+        count += d[k]
+
+    if count == 0:
+        return 0
+
+    acc = 0.0
+    last = 0
+    for k in sorted(d):
+        if last:
+            return (last + k) / 2.0
+        acc += d[k]
+        if acc / count == 0.5:
+            last = k
+        elif acc / count > 0.5:
+            return k
+
+
+def counts_mean_stdev(d):
+    '''
+
+    calc mean / stdev when data is stored as counts in a dictionary
+
+    Ex:
+        { 1: 4, 2: 1, 3: 4 } = [1, 1, 1, 1, 2, 3, 3, 3, 3]
+
+    >>> counts_mean_stdev({ 1: 4, 2: 1, 3: 4 })
+    (2.0, 1.0)
+
+    '''
+
+    acc = 0
+    count = 0
+    for k in d:
+        acc += k * d[k]
+        count += d[k]
+
+    mean = float(acc) / count
+
+    acc = 0
+    for k in d:
+        acc += (((k - mean) ** 2) * d[k])
+
+    if count > 1:
+        stdev = math.sqrt(float(acc) / (count - 1))
+    else:
+        stdev = 0.0
+
+    return (mean, stdev)
+
+@memoize
+def poisson_prob(x, mean):
+    '''
+        Return the probability that you could get x counts in
+        a Poisson test with a mean value.
+
+        prob(x) = sum(i=1..x){poisson(i)}
+
+        >>> poisson_prob(6,10)
+        0.1300960209527205
+        >>> poisson_prob(8,10)
+        0.33277427882095645
+    '''
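+    # Note: the sum starts at i=1, so this equals the Poisson CDF at x
+    # minus the i=0 term (e^-mean).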
+    acc = 0.0
+    for i in xrange(1, x+1):
+        acc += poisson_func(i, mean)
+    return acc
+
+@memoize
+def poisson_func(mu, lambd):
+    '''
+        This is the Poisson distribution function
+
+        p(mu) = (lambda^mu * e^(-lambda)) / (mu!)
+
+        mu is a count
+        lambd is the mean
+
+        >>> poisson_func(1,10)
+        0.00045399929762484856
+        >>> poisson_func(2,10)
+        0.0022699964881242427
+        >>> poisson_func(3,10)
+        0.007566654960414142
+    '''
+    return (lambd ** mu) * (math.exp(-1 * lambd)) / _factorial(mu)
+
+
+@memoize
+def _factorial(x):
+    '''
+    >>> _factorial(1)
+    1
+    >>> _factorial(2)
+    2
+    >>> _factorial(3)
+    6
+    '''
+    return math.factorial(x)
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
b
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_input1.bam
b
Binary file test-data/ngsutils_bam_filter_input1.bam has changed
b
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result1.bam
b
Binary file test-data/ngsutils_bam_filter_result1.bam has changed
b
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result2.bam
b
Binary file test-data/ngsutils_bam_filter_result2.bam has changed
b
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result3.bam
b
Binary file test-data/ngsutils_bam_filter_result3.bam has changed
b
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result4.bam
b
Binary file test-data/ngsutils_bam_filter_result4.bam has changed
b
diff -r 000000000000 -r 4e4e4093d65d tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Wed Nov 11 13:04:07 2015 -0500
b
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="pysam" version="0.7.7">
+        <repository changeset_revision="0a5141bdf9d0" name="package_pysam_0_7_7" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>