changeset 0:4e4e4093d65d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author iuc
date Wed, 11 Nov 2015 13:04:07 -0500
parents
children 8187a729d9f4
files bam_filter.xml filter.py macros.xml ngsutils/__init__.py ngsutils/__init__.pyc ngsutils/bam/__init__.py ngsutils/bam/__init__.pyc ngsutils/bed/__init__.py ngsutils/bed/__init__.pyc ngsutils/support/__init__.py ngsutils/support/__init__.pyc ngsutils/support/bgzip.py ngsutils/support/dbsnp.py ngsutils/support/dbsnp.pyc ngsutils/support/llh.py ngsutils/support/ngs_utils.py ngsutils/support/ngs_utils.pyc ngsutils/support/regions.py ngsutils/support/stats.py test-data/ngsutils_bam_filter_input1.bam test-data/ngsutils_bam_filter_result1.bam test-data/ngsutils_bam_filter_result2.bam test-data/ngsutils_bam_filter_result3.bam test-data/ngsutils_bam_filter_result4.bam tool_dependencies.xml
diffstat 24 files changed, 3534 insertions(+), 0 deletions(-) [+]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bam_filter.xml	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,231 @@
+<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@.0">
+    <description>Removes reads from a BAM file based on criteria</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="stdio" />
+    <expand macro="version" />
+    <command><![CDATA[
+    ## If the tool is executed with no filtering option,
+    ## the default parameters simply copy over the input file
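+    ## Otherwise filter.py is invoked with the flags collected in the
+    ## 'parameters' config file; an illustrative generated call:
+    ##   filter.py input.bam output.bam -minlen 50 -mapped -properpair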
+    if grep -q "\w" ${parameters};
+    then
+        $__tool_directory__/filter.py
+        $infile
+        $outfile
+        `cat ${parameters}`;
+    else
+        cp $infile $outfile;
+    fi
+]]>
+    </command>
+    <configfiles>
+        <configfile name="parameters">
+<![CDATA[
+        #if $minlen:
+            -minlen $minlen
+        #end if
+        #if $maxlen:
+            -maxlen $maxlen
+        #end if
+        $mapped
+        $unmapped
+        $properpair
+        $noproperpair
+        #if $mask:
+            -mask "${mask}"
+        #end if
+        #if int($uniq) > -1:
+            -uniq
+            #if int($uniq) > 0:
+                $uniq
+            #end if
+        #end if
+        $uniq_start
+        #if $mismatch:
+            -mismatch $mismatch
+        #end if
+        $nosecondary
+        $noqcfail
+        $nopcrdup
+        #if $excludebed:
+            -excludebed "${excludebed}" $ignore_strand
+        #end if
+        #if $includebed:
+            -includebed "${includebed}" $ignore_strand
+        #end if
+        #if $includeref:
+            -includeref "${includeref}"
+        #end if
+        #if $excluderef:
+            -excluderef "${excluderef}"
+        #end if
+        #if $maximum_mismatch_ratio:
+            -maximum_mismatch_ratio $maximum_mismatch_ratio
+        #end if
+    ]]>
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="infile" type="data" format="bam" label="Select BAM dataset" />
+        <param argument="-minlen" type="integer" value="" optional="True" min="0"
+            label="Remove reads that are smaller than"
+            help="in bp"/>
+        <param argument="-maxlen" type="integer" value="" optional="True" min="0"
+            label="Remove reads that are larger than"
+            help="in bp"/>
+        <param argument="-mapped" truevalue="-mapped" type="boolean" falsevalue=""
+            label="Keep only mapped reads"
+            help="" />
+        <param argument="-unmapped" truevalue="-unmapped" type="boolean" falsevalue=""
+            label="Keep only unmapped reads"
+            help="" />
+        <param argument="-properpair" truevalue="-properpair" type="boolean" falsevalue=""
+            label="Keep only properly paired reads"
+            help="both mapped, correct orientation, flag set in BAM" />
+        <param argument="-noproperpair" truevalue="-noproperpair" type="boolean" falsevalue=""
+            label="Discard properly paired reads"
+            help="" />
+        <param argument="-mask" type="text" value="" optional="True" label="Remove reads that match the mask"
+            help="e.g. 0x400, 0x2" />
+
+        <param argument="-uniq" type="integer" value="-1" optional="True" min="-1"
+            label="Remove reads that have the same sequence"
+            help="up to the Nth nucleotide starting from the 5prime end. -1 means do not filter, 0 means check along the entire read,
+                10 would make filter reads that are unique over the first 10 nucleotides."/>
+
+        <param argument="-uniq_start" truevalue="-uniq_start" type="boolean" falsevalue=""
+            label="Remove reads that start at the same position"
+            help="Use only for low-coverage samples" />
+
+        <param argument="-mismatch" type="integer" value="" optional="True" min="0"
+            label="Remove reads with that many mismatches"
+            help="Indels always counts as 1 regardless of length. Requires NM tag to be set."/>
+
+        <param argument="-nosecondary" truevalue="-nosecondary" type="boolean" falsevalue=""
+            label="Remove secondary alignment reads"
+            help="Remove reads flagged with 0x100" />
+        <param argument="-noqcfail" truevalue="-noqcfail" type="boolean" falsevalue=""
+            label="Remove reads that do not pass the quality control"
+            help="Remove reads flagged with 0x200" />
+        <param argument="-nopcrdup" truevalue="-nopcrdup" type="boolean" falsevalue=""
+            label="Remove reads that are marked as PCR dupicates "
+            help="Remove reads flagged with 0x400" />
+
+        <param argument="-excludebed" type="data" optional="True" format="bed" label="Remove reads that are in any of the regions" />
+        <param argument="-includebed" type="data" optional="True" format="bed" label="Remove reads that are NOT any of the regions" />
+        <param name="ignore_strand" truevalue="nostrand" type="boolean" falsevalue=""
+            label="Strand information from BED file is ignored. Affects -excludebed and -includebed."
+            help="" />
+
+        <param argument="-includeref" type="text" value="" optional="True" label="Exclude reads NOT mapped to a reference"
+            help="" />
+        <param argument="-excluderef" type="text" value="" optional="True" label="Exclude reads mapped to a particular reference"
+            help="e.g. chrM, or _dup chromosomes" />
+
+        <param argument="-maximum_mismatch_ratio" type="float" value="" optional="True" min="0.0" max="1.0"
+            label="Filter by maximum mismatch ratio"
+            help="fraction of length"/>
+
+    </inputs>
+    <outputs>
+        <data format="bam" name="outfile" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="infile" ftype="bam" value="ngsutils_bam_filter_input1.bam"/>
+            <param name="minlen" value="100"/>
+            <param name="maximum_mismatch_ratio" value="0.02"/>
+            <output name="outfile" file="ngsutils_bam_filter_result1.bam" ftype="bam" />
+        </test>
+        <test>
+            <param name="infile" ftype="bam" value="ngsutils_bam_filter_input1.bam"/>
+            <param name="minlen" value="250"/>
+            <param name="properpair" value="True"/>
+            <output name="outfile" file="ngsutils_bam_filter_result2.bam" ftype="bam" />
+        </test>
+        <test>
+            <param name="infile" ftype="bam" value="ngsutils_bam_filter_input1.bam"/>
+            <param name="minlen" value="100"/>
+            <param name="nosecondary" value="True"/>
+            <param name="noqcfail" value="True"/>
+            <param name="nopcrdup" value="True"/>
+            <output name="outfile" file="ngsutils_bam_filter_result3.bam" ftype="bam" />
+        </test>
+        <test>
+            <param name="infile" ftype="bam" value="ngsutils_bam_filter_input1.bam"/>
+            <output name="outfile" file="ngsutils_bam_filter_input1.bam" ftype="bam" />
+        </test>
+        <test>
+            <param name="infile" ftype="bam" value="ngsutils_bam_filter_input1.bam"/>
+            <param name="mask" value="0x40"/>
+            <output name="outfile" file="ngsutils_bam_filter_result4.bam" ftype="bam" />
+        </test>
+    </tests>
+    <help><![CDATA[
+Removes reads from a BAM file based on criteria.
+
+Given a BAM file, this tool will discard reads that do not meet the selected filtering criteria.
+The output is another BAM file with the reads not matching the criteria removed.
+
+Note: this does not adjust tag values to reflect any filtering. (For example:
+      if a read mapped to two locations (IH:i:2) and one was removed by
+      filtering, the IH:i tag would still read IH:i:2.)
+
+Currently, the available filters are:
+
++--------------------------------+-------------------------------------------------+
+| Argument                       | Description                                     |
++================================+=================================================+
+| -minlen val                    | Remove reads that are smaller than {val}        |
++--------------------------------+-------------------------------------------------+
+| -maxlen val                    | Remove reads that are larger than {val}         |
++--------------------------------+-------------------------------------------------+
+| -mapped                        | Keep only mapped reads                          |
++--------------------------------+-------------------------------------------------+
+| -unmapped                      | Keep only unmapped reads                        |
++--------------------------------+-------------------------------------------------+
+| -properpair                    | Keep only properly paired reads (both mapped,   |
+|                                | correct orientation, flag set in BAM)           |
++--------------------------------+-------------------------------------------------+
+| -noproperpair                  | Keep only not-properly paired reads             |
++--------------------------------+-------------------------------------------------+
+| -mask bitmask                  | Remove reads that match the mask (e.g. 0x400)   |
++--------------------------------+-------------------------------------------------+
+| -uniq {length}                 | Remove reads that have the same sequence        |
+|                                | Note: BAM file should be sorted                 |
+|                                | (up to an optional length)                      |
++--------------------------------+-------------------------------------------------+
+| -uniq_start                    | Remove reads that start at the same position    |
+|                                | Note: BAM file should be sorted                 |
+|                                | (Use only for low-coverage samples)             |
++--------------------------------+-------------------------------------------------+
+|-mismatch num                   | Number of mismatches or indels                  |
+|                                | indel always counts as 1 regardless of length   |
+|                                | (requires NM tag in reads)                      |
++--------------------------------+-------------------------------------------------+
+|-nosecondary                    | Remove reads that have the 0x100 flag set       |
++--------------------------------+-------------------------------------------------+
+|-noqcfail                       | Remove reads that have the 0x200 flag set       |
++--------------------------------+-------------------------------------------------+
+|-nopcrdup                       | Remove reads that have the 0x400 flag set       |
++--------------------------------+-------------------------------------------------+
+|-excludebed file.bed {nostrand} | Remove reads that are in any of the regions     |
+|                                | from the given BED file. If 'nostrand' is given,|
+|                                | strand information from the BED file is ignored.|
++--------------------------------+-------------------------------------------------+
+|-includebed file.bed {nostrand} | Remove reads that are NOT in any of the regions |
+|                                | from the given BED file. If 'nostrand' is given,|
+|                                | strand information from the BED file is ignored.|
+|                                | Note: If this is a large dataset, use           |
+|                                | "bamutils extract" instead.                     |
++--------------------------------+-------------------------------------------------+
+| -includeref refname            | Exclude reads NOT mapped to a reference         |
++--------------------------------+-------------------------------------------------+
+| -excluderef refname            | Exclude reads mapped to a particular reference  |
+|                                | (e.g. chrM, or _dup chromosomes)                |
++--------------------------------+-------------------------------------------------+
+]]>
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,986 @@
+#!/usr/bin/env python
+## category General
+## desc Removes reads from a BAM file based on criteria
+"""
+Removes reads from a BAM file based on criteria
+
+Given a BAM file, this script will only allow reads that meet filtering
+criteria to be written to output. The output is another BAM file with the
+reads not matching the criteria removed.
+
+Note: this does not adjust tag values to reflect any filtering. (for example:
+      if a read mapped to two locations (IH:i:2), and one was removed by
+      filtering, the IH:i tag would still read IH:i:2).
+
+Currently, the available filters are:
+    -minlen val                Remove reads that are smaller than {val}
+    -maxlen val                Remove reads that are larger than {val}
+    -mapped                    Keep only mapped reads
+    -unmapped                  Keep only unmapped reads
+    -properpair                Keep only properly paired reads (both mapped, 
+                               correct orientation, flag set in BAM)
+    -noproperpair              Keep only not-properly paired reads
+
+    -mask bitmask              Remove reads that match the mask (base 10/hex)
+    -uniq {length}             Remove reads that have the same sequence
+                               Note: BAM file should be sorted
+                               (up to an optional length)
+    -uniq_start                Remove reads that start at the same position
+                               Note: BAM file should be sorted
+                               (Use only for low-coverage samples)
+
+    -mismatch num              # mismatches or indels
+                               indel always counts as 1 regardless of length
+                               (requires NM tag in reads)
+
+    -mismatch_dbsnp num dbsnp.txt.bgz
+                               # mismatches or indels - not in dbSNP.
+                               Variations are called using the MD tag.
+                               Variations that are found in the dbSNP list are
+                               not counted as mismatches. The dbSNP list is a
+                               Tabix-indexed dump of dbSNP (from UCSC Genome
+                               Browser). Indels in dbSNP are also counted.
+                               Adds a 'ZS:i' tag with the number of found SNPs
+                               in the read.
+                               (requires NM and MD tags)
+
+                               Example command for indexing:
+                               ngsutils tabixindex snp.txt.gz -s 2 -b 3 -e 4 -0
+
+    -mismatch_ref num ref.fa   # mismatches or indels - looks up mismatches
+                               directly in a reference FASTA file
+                               (use if NM tag not present)
+
+    -mismatch_ref_dbsnp num ref.fa dbsnp.txt.bgz
+                               # mismatches or indels - looks up mismatches
+                               directly from a reference FASTA file. (See
+                               -mismatch_dbsnp for dbSNP matching)
+                               (use if NM or MD tag not present)
+
+    -nosecondary               Remove reads that have the 0x100 flag set
+    -noqcfail                  Remove reads that have the 0x200 flag set
+    -nopcrdup                  Remove reads that have the 0x400 flag set
+
+
+    -exclude ref:start-end     Remove reads in this region (1-based start)
+    -excludebed file.bed {nostrand}
+                               Remove reads that are in any of the regions
+                               from the given BED file. If 'nostrand' is given,
+                               strand information from the BED file is ignored.
+
+    -include ref:start-end     Remove reads NOT in the region (can only be one)
+    -includebed file.bed {nostrand}
+                               Remove reads that are NOT in any of the regions
+                               from the given BED file. If 'nostrand' is given,
+                               strand information from the BED file is ignored.
+
+                               Note: If this is a large dataset, use
+                               "bamutils extract" instead.
+
+    -includeref refname        Exclude reads NOT mapped to a reference
+    -excluderef refname        Exclude reads mapped to a particular reference
+                               (e.g. chrM, or _dup chromosomes)
+
+    -whitelist fname           Remove reads that aren't on this list (by name)
+    -blacklist fname           Remove reads that are on this list (by name)
+                                 These lists can be whitespace-delimited with
+                                 the read name as the first column.
+    -maximum_mismatch_ratio val
+                               Filter by maximum mismatch ratio (fraction of length)
+
+    -eq  tag_name value
+    -lt  tag_name value
+    -lte tag_name value
+    -gt  tag_name value
+    -gte tag_name value
+
+    As a special case, "MAPQ" can be used as the tag_name and the SAM MAPQ
+    value will be used.
+
+Common tags to filter by:
+    AS      Alignment score
+    IH      Number of alignments
+    NM      Edit distance (each indel counts as many as its length)
+
+    MAPQ    Mapping quality (defined as part of SAM spec)
+
+    The tag type (:i, :f, :Z) is optional.
+
+"""
+
+import os
+import sys
+import pysam
+from ngsutils.bam import bam_iter
+from ngsutils.support.dbsnp import DBSNP
+from ngsutils.bam import read_calc_mismatches, read_calc_mismatches_ref, read_calc_mismatches_gen, read_calc_variations
+from ngsutils.bed import BedFile
+
+
+def usage():
+    print __doc__
+    print """
+Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria...
+
+Options:
+  -failed fname    A text file containing the read names of all reads
+                   that were removed with filtering
+
+Example:
+bamutils filter filename.bam output.bam -mapped -gte AS:i 1000
+
+This will remove all unmapped reads, as well as any reads that have an AS:i
+value less than 1000.
+"""
+    sys.exit(1)
+
+
+class Unique(object):
+    def __init__(self, length=None):
+        if length:
+            self.length = int(length)
+        else:
+            self.length = None
+
+        self.last_pos = None
+        self.pos_reads = set()
+
+    def __repr__(self):
+        return "uniq"
+
+    def filter(self, bam, read):
+        if self.last_pos != (read.tid, read.pos):
+            self.last_pos = (read.tid, read.pos)
+            self.pos_reads = set()
+
+        if read.is_reverse:
+            seq = read.seq[::-1]  # ignore revcomp for now; it isn't needed, we just need to compare in a consistent order
+        else:
+            seq = read.seq
+
+        if self.length:
+            seq = seq[:self.length]
+
+        if seq in self.pos_reads:
+            return False
+
+        self.pos_reads.add(seq)
+        return True
+
+    def close(self):
+        pass
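+
+# Illustrative sketch (not part of the original tool): how Unique treats two
+# reads at the same position whose first 10 nt match. The minimal read object
+# below is a hypothetical stand-in for a pysam read.
+#
+#     class FakeRead(object):
+#         def __init__(self, tid, pos, seq, is_reverse=False):
+#             self.tid, self.pos, self.seq, self.is_reverse = tid, pos, seq, is_reverse
+#
+#     u = Unique(10)
+#     u.filter(None, FakeRead(0, 100, 'AAAACCCCGGGGTTTT'))  # True: first seen here
+#     u.filter(None, FakeRead(0, 100, 'AAAACCCCGGTTTTTT'))  # False: same first 10 nt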
+
+
+class UniqueStart(object):
+    def __init__(self):
+        self.last_tid = None
+        self.last_fwd_pos = -1
+        self.last_rev_pos = -1
+        self.rev_pos = set()
+
+    def __repr__(self):
+        return "uniq_start"
+
+    def filter(self, bam, read):
+        if self.last_tid is None or self.last_tid != read.tid:
+            self.last_tid = read.tid
+            self.rev_pos = set()
+            self.last_fwd_pos = -1
+            self.last_rev_pos = -1
+
+        if read.is_reverse:
+            # check reverse reads from their start (3' aend)
+            # these aren't necessarily in the correct
+            # order in the file, so we have to track them in a set
+
+            start_pos = read.aend
+
+            # clean up hash if necessary
+            # Allow up to 100k over, to balance cleaning up the rev_pos hash
+            # and memory
+            if read.pos > (self.last_rev_pos + 100000):
+                self.last_rev_pos = start_pos
+                del_list = []
+                for k in self.rev_pos:
+                    if k < read.pos:
+                        del_list.append(k)
+
+                for k in del_list:
+                    self.rev_pos.remove(k)
+
+            if start_pos not in self.rev_pos:
+                self.rev_pos.add(start_pos)
+                return True
+            return False
+        else:
+            if read.pos != self.last_fwd_pos:
+                self.last_fwd_pos = read.pos
+                return True
+            return False
+
+        # if self.last_pos != (read.tid, read.pos):
+        #     self.last_pos = (read.tid, read.pos)
+        #     return True
+        # return False
+
+    def close(self):
+        pass
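+
+# Note (added for clarity): forward reads are deduplicated by their 5' start
+# (read.pos, relying on the position-sorted input), while reverse reads are
+# deduplicated by their 3' end (read.aend); since those ends are not sorted in
+# the file, they are tracked in a set that is pruned with a ~100 kb window.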
+
+
+class Blacklist(object):
+    def __init__(self, fname):
+        self.fname = fname
+        self.notallowed = set()
+        with open(fname) as f:
+            for line in f:
+                self.notallowed.add(line.strip().split()[0])
+
+    def filter(self, bam, read):
+        return read.qname not in self.notallowed
+
+    def __repr__(self):
+        return 'Blacklist: %s' % (self.fname)
+
+    def close(self):
+        pass
+
+
+class Whitelist(object):
+    def __init__(self, fname):
+        self.fname = fname
+        self.allowed = set()
+        with open(fname) as f:
+            for line in f:
+                self.allowed.add(line.strip().split()[0])
+
+    def filter(self, bam, read):
+        return read.qname in self.allowed
+
+    def __repr__(self):
+        return 'Whitelist: %s' % (self.fname)
+
+    def close(self):
+        pass
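+
+# Example list file format for Whitelist/Blacklist (illustrative; the lists
+# are whitespace-delimited and only the first column, the read name, is used):
+#
+#   HWI-ST123:4:1101:1224:2158   optional_extra_columns_ignored
+#   HWI-ST123:4:1101:1374:2201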
+
+
+class IncludeRegion(object):
+    _excludes = []
+    _last = None
+
+    def __init__(self, region):
+        self.region = region
+        IncludeRegion._excludes.append(ExcludeRegion(region))
+
+    def filter(self, bam, read):
+        if read == IncludeRegion._last:
+            return True
+
+        IncludeRegion._last = read
+
+        for excl in IncludeRegion._excludes:
+            if not excl.filter(bam, read):
+                return True
+        return False
+
+
+    def __repr__(self):
+        return 'Including: %s' % (self.region)
+
+    def close(self):
+        pass
+
+
+class IncludeBED(object):
+    def __init__(self, fname, nostrand=None):
+        self.excl = ExcludeBED(fname, nostrand)
+
+    def filter(self, bam, read):
+        return not self.excl.filter(bam, read)
+
+    def __repr__(self):
+        return 'Including from BED: %s%s' % (self.excl.fname, ' nostrand' if self.excl.nostrand else '')
+
+    def close(self):
+        pass
+
+
+class ExcludeRegion(object):
+    def __init__(self, region):
+        self.region = region
+        spl = region.split(':')
+        self.chrom = spl[0]
+        se = [int(x) for x in spl[1].split('-')]
+        self.start = se[0] - 1
+        self.end = se[1]
+
+    def filter(self, bam, read):
+        if not read.is_unmapped:
+            if bam.getrname(read.tid) == self.chrom:
+                if self.start <= read.pos <= self.end:
+                    return False
+                if self.start <= read.aend <= self.end:
+                    return False
+        return True
+
+    def __repr__(self):
+        return 'Excluding: %s' % (self.region)
+
+    def close(self):
+        pass
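+
+# Worked example (added note): ExcludeRegion('chr1:1000-2000') parses to
+# chrom='chr1', start=999 (1-based input converted to 0-based), end=2000; a
+# read is then removed if either its start (read.pos) or its end (read.aend)
+# falls within [start, end].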
+
+
+class ExcludeRef(object):
+    def __init__(self, ref):
+        self.ref = ref
+
+    def filter(self, bam, read):
+        if not read.is_unmapped:
+            if bam.getrname(read.tid) == self.ref:
+                return False
+        return True
+
+    def __repr__(self):
+        return 'Excluding: %s' % (self.ref)
+
+    def close(self):
+        pass
+
+
+class IncludeRef(object):
+    def __init__(self, ref):
+        self.ref = ref
+
+    def filter(self, bam, read):
+        if not read.is_unmapped:
+            if bam.getrname(read.tid) == self.ref:
+                return True
+        return False
+
+    def __repr__(self):
+        return 'Including: %s' % (self.ref)
+
+    def close(self):
+        pass
+
+
+class ExcludeBED(object):
+    def __init__(self, fname, nostrand=None):
+        self.fname = fname
+        self.fname = fname
+        if nostrand == 'nostrand':
+            self.nostrand = True
+        else:
+            self.nostrand = False
+
+        self.bed = BedFile(fname)
+        # with open(fname) as f:
+        #     for line in f:
+        #         if not line:
+        #             continue
+        #         if line[0] == '#':
+        #             continue
+        #         cols = line.strip().split('\t')
+
+        #         chrom = cols[0]
+        #         start = int(cols[1])
+        #         end = int(cols[2])
+        #         if self.nostrand:
+        #             strand = '?'
+        #         else:
+        #             strand = cols[5]
+
+        #         startbin = start / 100000
+        #         endbin = end / 100000
+
+        #         for bin in xrange(startbin, endbin + 1):
+        #             if not (chrom, bin) in self.regions:
+        #                 self.regions[(chrom, bin)] = []
+        #         self.regions[(chrom, bin)].append((start, end, strand))
+
+    def filter(self, bam, read):
+        if not read.is_unmapped:
+            if self.nostrand:
+                strand = None
+            elif read.is_reverse:
+                strand = '-'
+            else:
+                strand = '+'
+
+            for region in self.bed.fetch(bam.getrname(read.tid), read.pos, read.aend, strand):
+                # region found, exclude read
+                return False
+        return True
+
+        #     bin = read.pos / 100000
+        #     ref = bam.getrname(read.tid)
+
+        #     if not (ref, bin) in self.regions:
+        #         return True
+
+        #     for start, end, strand in self.regions[(ref, bin)]:
+        #         if not self.nostrand:
+        #             if strand == '+' and read.is_reverse:
+        #                 continue
+        #             if strand == '-' and not read.is_reverse:
+        #                 continue
+        #         if start <= read.pos <= end:
+        #             return False
+        #         if start <= read.aend <= end:
+        #             return False
+        # return True
+
+    def __repr__(self):
+        return 'Excluding from BED: %s%s' % (self.fname, ' nostrand' if self.nostrand else '')
+
+    def close(self):
+        pass
+
+
+class Mismatch(object):
+    def __init__(self, num):
+        self.num = int(num)
+
+    def filter(self, bam, read):
+        if read.is_unmapped:
+            return False
+
+        if read_calc_mismatches(read) > self.num:
+            return False
+
+        return True
+
+    def __repr__(self):
+        return '>%s mismatch%s' % (self.num, '' if self.num == 1 else 'es')
+
+    def close(self):
+        pass
+
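+# Worked example (added note): with -mismatch 2, a read carrying NM:i:3 from
+# one substitution plus a 2 bp deletion yields read_calc_mismatches() =
+# 3 - 2 (deleted bases) + 1 (one indel event) = 2, so the read is kept.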
+
+class MismatchRef(object):
+    def __init__(self, num, refname):
+        self.num = int(num)
+        self.refname = refname
+
+        if not os.path.exists('%s.fai' % refname):
+            pysam.faidx(refname)
+
+        self.ref = pysam.Fastafile(refname)
+
+    def filter(self, bam, read):
+        if read.is_unmapped:
+            return False
+
+        chrom = bam.getrname(read.tid)
+        if read_calc_mismatches_ref(self.ref, read, chrom) > self.num:
+            return False
+
+        return True
+
+    def __repr__(self):
+        return '>%s mismatch%s in %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.refname))
+
+    def close(self):
+        self.ref.close()
+
+
+class MismatchDbSNP(object):
+    def __init__(self, num, fname, verbose=None):
+        sys.stderr.write('Note: MismatchDbSNP is considered *experimental*\n')
+
+        self.num = int(num)
+        self.fname = fname
+        self.dbsnp = DBSNP(fname)
+        if verbose == 'verbose':
+            self.verbose = True
+        else:
+            self.verbose = False
+
+    def filter(self, bam, read):
+        if read.is_unmapped:
+            return False
+
+        if read_calc_mismatches(read) <= self.num:
+            return True
+
+        chrom = bam.getrname(read.tid)
+
+        mm = 0
+        snps = 0
+
+        for op, pos, seq in read_calc_variations(read):
+            if not self.dbsnp.is_valid_variation(chrom, op, pos, seq, self.verbose):
+                mm += 1
+            else:
+                snps += 1
+
+        if mm > self.num:
+            return False
+
+        if snps:
+            read.tags = read.tags + [('ZS', snps)]
+
+        return True
+
+    def __repr__(self):
+        return '>%s mismatch%s using %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.fname))
+
+    def close(self):
+        self.dbsnp.close()
+
+
+class MismatchRefDbSNP(object):
+    def __init__(self, num, refname, dbsnpname):
+        sys.stderr.write('Note: MismatchRefDbSNP is considered *experimental*\n')
+        self.num = int(num)
+        self.refname = refname
+        self.dbsnpname = dbsnpname
+        self.dbsnp = DBSNP(dbsnpname)
+
+        if not os.path.exists('%s.fai' % refname):
+            pysam.faidx(refname)
+
+        self.ref = pysam.Fastafile(refname)
+
+    def filter(self, bam, read):
+        if read.is_unmapped:
+            return False
+
+        chrom = bam.getrname(read.tid)
+
+        mm = 0
+        snps = 0
+
+        for op, pos, seq in read_calc_mismatches_gen(self.ref, read, chrom):
+            if not self.dbsnp.is_valid_variation(chrom, op, pos, seq):
+                mm += 1
+            else:
+                snps += 1
+
+        if mm > self.num:
+            return False
+
+        if snps:
+            read.tags = read.tags + [('ZS', snps)]
+
+        return True
+
+    def __repr__(self):
+        return '>%s mismatch%s using %s/%s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.dbsnpname), os.path.basename(self.refname))
+
+    def close(self):
+        self.ref.close()
+        self.dbsnp.close()
+
+
+class Mapped(object):
+    def __init__(self):
+        pass
+
+    def filter(self, bam, read):
+        if read.is_paired and (read.is_unmapped or read.mate_is_unmapped):
+            return False
+        elif read.is_unmapped:
+            return False
+        return True
+
+    def __repr__(self):
+        return 'is mapped'
+
+    def close(self):
+        pass
+
+
+class Unmapped(object):
+    def __init__(self):
+        pass
+
+    def filter(self, bam, read):
+        if read.is_paired and (read.is_unmapped or read.mate_is_unmapped):
+            return True
+        elif read.is_unmapped:
+            return True
+        return False
+
+    def __repr__(self):
+        return 'is unmapped'
+
+    def close(self):
+        pass
+
+
+class ProperPair(object):
+    def __init__(self):
+        pass
+
+    def filter(self, bam, read):
+        if not read.is_paired:
+            return False
+
+        if read.is_unmapped or read.mate_is_unmapped:
+            return False
+
+        if read.is_reverse == read.mate_is_reverse:
+            return False
+
+        return read.is_proper_pair
+
+    def __repr__(self):
+        return 'proper pair'
+
+    def close(self):
+        pass
+
+
+class NoProperPair(object):
+    def __init__(self):
+        self.proper = ProperPair()
+        pass
+
+    def filter(self, bam, read):
+        return not self.proper.filter(bam, read)
+
+    def __repr__(self):
+        return 'not proper pairs'
+
+    def close(self):
+        pass
+
+
+class MaskFlag(object):
+    def __init__(self, value):
+        if isinstance(value, int):
+            self.flag = value
+        else:
+            if value[0:2] == '0x':
+                self.flag = int(value, 16)
+            else:
+                self.flag = int(value)
+
+    def __repr__(self):
+        return "Doesn't match flag: %s" % self.flag
+
+    def filter(self, bam, read):
+        return (read.flag & self.flag) == 0
+
+    def close(self):
+        pass
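+
+# Illustrative: MaskFlag('0x104') keeps a read only if neither the 0x100
+# (secondary) nor the 0x4 (unmapped) FLAG bit is set; any overlap between the
+# read's FLAG and the mask removes the read.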
+
+
+class SecondaryFlag(object):
+    def __repr__(self):
+        return "no 0x100 (secondary) flag"
+
+    def filter(self, bam, read):
+        return not read.is_secondary
+
+    def close(self):
+        pass
+
+
+class ReadMinLength(object):
+    def __init__(self, minval):
+        self.minval = int(minval)
+
+    def __repr__(self):
+        return "read length min: %s" % self.minval
+
+    def filter(self, bam, read):
+        return len(read.seq) >= self.minval
+
+    def close(self):
+        pass
+
+
+class ReadMaxLength(object):
+    def __init__(self, val):
+        self.val = int(val)
+
+    def __repr__(self):
+        return "read length max: %s" % self.val
+
+    def filter(self, bam, read):
+        return len(read.seq) <= self.val
+
+    def close(self):
+        pass
+
+
+class MaximumMismatchRatio(object):
+    def __init__(self, ratio):
+        self.ratio = float(ratio)
+
+    def __repr__(self):
+        return "maximum mismatch ratio: %s" % self.ratio
+
+    def filter(self, bam, read):
+        return read_calc_mismatches(read) <= self.ratio * len(read.seq)
+
+    def close(self):
+        pass
+
+
+class QCFailFlag(object):
+    def __repr__(self):
+        return "no 0x200 (qcfail) flag"
+
+    def filter(self, bam, read):
+        return not read.is_qcfail
+
+    def close(self):
+        pass
+
+
+class PCRDupFlag(object):
+    def __repr__(self):
+        return "no 0x400 (pcrdup) flag"
+
+    def filter(self, bam, read):
+        return not read.is_duplicate
+
+    def close(self):
+        pass
+
+
+class _TagCompare(object):
+    def __init__(self, tag, value):
+        self.args = '%s %s' % (tag, value)
+
+        if ':' in tag:
+            self.tag = tag.split(':')[0]
+            tagtype = tag.split(':')[1]
+            if tagtype == 'i':
+                self.value = int(value)
+            elif tagtype == 'f':
+                self.value = float(value)
+            elif tagtype == 'H':
+                self.value = int(value)  # not sure about this
+            else:
+                self.value = value
+        else:
+            self.tag = tag
+
+            # guess at type...
+            try:
+                self.value = int(value)
+            except ValueError:
+                try:
+                    self.value = float(value)
+                except ValueError:
+                    self.value = value
+
+    def get_value(self, read):
+        if self.tag == 'MAPQ':
+            return read.mapq
+
+        for name, value in read.tags:
+            if name == self.tag:
+                return value
+
+        return None
+
+    def __repr__(self):
+        return "%s %s %s" % (self.tag, self.__class__.op, self.value)
+
+    def close(self):
+        pass
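+
+# Illustrative: TagGreaterThanEqual('AS:i', '1000') parses the value as an int
+# (from the ':i' tag type) and keeps reads whose AS tag is >= 1000; the
+# special tag name 'MAPQ' compares against read.mapq instead of a BAM tag.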
+
+
+class TagLessThan(_TagCompare):
+    op = '<'
+
+    def filter(self, bam, read):
+        if self.get_value(read) < self.value:
+            return True
+        return False
+
+
+class TagLessThanEqual(_TagCompare):
+    op = '<='
+
+    def filter(self, bam, read):
+        if self.get_value(read) <= self.value:
+            return True
+        return False
+
+
+class TagGreaterThan(_TagCompare):
+    op = '>'
+
+    def filter(self, bam, read):
+        if self.get_value(read) > self.value:
+            return True
+        return False
+
+
+class TagGreaterThanEqual(_TagCompare):
+    op = '>='
+
+    def filter(self, bam, read):
+        if self.get_value(read) >= self.value:
+            return True
+        return False
+
+
+class TagEqual(_TagCompare):
+    op = '='
+
+    def filter(self, bam, read):
+        if self.get_value(read) == self.value:
+            return True
+        return False
+
+_criteria = {
+    'mapped': Mapped,
+    'unmapped': Unmapped,
+    'properpair': ProperPair,
+    'noproperpair': NoProperPair,
+    'noqcfail': QCFailFlag,
+    'nosecondary': SecondaryFlag,
+    'nopcrdup': PCRDupFlag,
+    'mask': MaskFlag,
+    'lt': TagLessThan,
+    'gt': TagGreaterThan,
+    'lte': TagLessThanEqual,
+    'gte': TagGreaterThanEqual,
+    'eq': TagEqual,
+    'mismatch': Mismatch,
+    'mismatch_ref': MismatchRef,
+    'mismatch_dbsnp': MismatchDbSNP,
+    'mismatch_ref_dbsnp': MismatchRefDbSNP,
+    'exclude': ExcludeRegion,
+    'excludebed': ExcludeBED,
+    'excluderef': ExcludeRef,
+    'includeref': IncludeRef,
+    'include': IncludeRegion,
+    'includebed': IncludeBED,
+    'whitelist': Whitelist,
+    'blacklist': Blacklist,
+    'length': ReadMinLength,  # deprecated
+    'minlen': ReadMinLength,
+    'maxlen': ReadMaxLength,
+    'uniq': Unique,
+    'maximum_mismatch_ratio': MaximumMismatchRatio
+}
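+
+# Dispatch example (added note): the command-line arguments "-gte AS:i 1000"
+# become _criteria['gte']('AS:i', '1000'), i.e. TagGreaterThanEqual, in the
+# argument loop under __main__ below.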
+
+
+def bam_filter(infile, outfile, criteria, failedfile=None, verbose=False):
+    if verbose:
+        sys.stderr.write('Input file  : %s\n' % infile)
+        sys.stderr.write('Output file : %s\n' % outfile)
+        if failedfile:
+            sys.stderr.write('Failed reads: %s\n' % failedfile)
+        sys.stderr.write('Criteria:\n')
+        for criterion in criteria:
+            sys.stderr.write('    %s\n' % criterion)
+
+        sys.stderr.write('\n')
+
+    bamfile = pysam.Samfile(infile, "rb")
+    outfile = pysam.Samfile(outfile, "wb", template=bamfile)
+
+    if failedfile:
+        failed_out = open(failedfile, 'w')
+    else:
+        failed_out = None
+
+    passed = 0
+    failed = 0
+
+    def _callback(read):
+        return "%s | %s kept,%s failed" % ('%s:%s' % (bamfile.getrname(read.tid), read.pos) if read.tid > -1 else 'unk', passed, failed)
+
+    for read in bam_iter(bamfile, quiet=True):
+        p = True
+
+        for criterion in criteria:
+            if not criterion.filter(bamfile, read):
+                p = False
+                failed += 1
+                if failed_out:
+                    failed_out.write('%s\t%s\n' % (read.qname, criterion))
+                #outfile.write(read_to_unmapped(read))
+                break
+        if p:
+            passed += 1
+            outfile.write(read)
+
+    bamfile.close()
+    outfile.close()
+    if failed_out:
+        failed_out.close()
+    sys.stdout.write("%s kept\n%s failed\n" % (passed, failed))
+
+    for criterion in criteria:
+        criterion.close()
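+
+# Minimal programmatic sketch (illustrative, not part of the original CLI):
+#
+#     bam_filter('in.bam', 'out.bam', [Mapped(), ReadMinLength(50)])
+#
+# would write to out.bam only the reads passing every criterion, printing the
+# kept/failed counts to stdout.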
+
+
+def read_to_unmapped(read):
+    '''
+    Example unmapped read
+    2358_2045_1839    |  4 | *                             |  0    |  0   | *                  | *  |  0 |  0 | GGACGAGAAGGAGTATTTCTCCGAGAACACATTCACGGAGAGTCTAACTC           | 0QT\VNO^IIRJKXTIHIKTY\STZZ[XOJKPWLHJQQQ^XPQYIIRQII           | PG:Z:bfast   | AS:i:-                     | NH:i:0   | IH:i:1   | HI:i:1                                     | CS:Z:T10213222020221330022203222011113021130222212230122             | CQ:Z:019=2+><%@.'>52%)'15?90<7;:5+(-49('-7-<>5-@5%<.2%=            | CM:i:0   | XA:i:4
+
+    Example mapped read:
+    2216_1487_198     |  16 | chr11:19915291-19925392      |  5531 |   12 | 24M2D26M           | *  |  0 |  0 | TCTGTCTGGGTGCAGTAGCTATACGTAATCCCAGCACTTTGGGAGGCCAA           | 1C8FZ`M""""""WZ""%"#\I;"`R@D]``ZZ``\QKX\Y]````IK^`           | PG:Z:bfast   | AS:i:1150   | NM:i:2   | NH:i:10   | IH:i:1   | HI:i:1   | MD:Z:24^CT26                   | CS:Z:T00103022001002113210023031213331122121223321221122             | CQ:Z:A8=%;AB<;97<3/9:>>3>@?5&18@-%;866:94=14:=?,%?:7&)1            | CM:i:9   | XA:i:4   | XE:Z:-------23322---2-11----2----------------------------
+
+    '''
+    # TODO: rewrite read properties to unmapped state
+    #       if colorspace: convert CS to NT directly
+    #       remove excess tags
+
+    read.is_unmapped = True
+    read.tid = -1
+    read.pos = 0
+    read.mapq = 0
+    return read
+
+if __name__ == '__main__':
+    infile = None
+    outfile = None
+    failed = None
+    criteria = []
+
+    crit_args = []
+    last = None
+    verbose = False
+    fail = False
+
+    for arg in sys.argv[1:]:
+        if last == '-failed':
+            failed = arg
+            last = None
+        elif arg == '-h':
+            usage()
+        elif arg == '-failed':
+            last = arg
+        elif arg == '-v':
+            verbose = True
+        elif not infile and os.path.exists(arg):
+            infile = arg
+        elif not outfile:
+            outfile = arg
+        elif arg[0] == '-':
+            if arg[1:] not in _criteria:
+                print "Unknown criterion: %s" % arg
+                fail = True
+            if crit_args and crit_args[0][1:] in _criteria:
+                criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+            crit_args = [arg, ]
+        elif crit_args:
+            crit_args.append(arg)
+        else:
+            print "Unknown argument: %s" % arg
+            fail = True
+
+    if not fail and crit_args:
+        criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+
+    if fail or not infile or not outfile or not criteria:
+        if not infile and not outfile and not criteria:
+            usage()
+
+        if not infile:
+            print "Missing: input bamfile"
+        if not outfile:
+            print "Missing: output bamfile"
+        if not criteria:
+            print "Missing: filtering criteria"
+        usage()
+    else:
+        bam_filter(infile, outfile, criteria, failed, verbose)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,16 @@
+<macros>
+    <token name="@WRAPPER_VERSION@">0.5.7</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="0.7.7">pysam</requirement>
+        </requirements>
+    </xml>
+    <xml name="version">
+        <version_command>echo "@WRAPPER_VERSION@"</version_command>
+    </xml>
+    <xml name="stdio">
+        <stdio>
+            <exit_code range="1:" level="fatal" />
+        </stdio>
+    </xml>
+</macros>
Binary file ngsutils/__init__.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/bam/__init__.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,879 @@
+import sys
+import os
+import re
+import pysam
+try:
+    from eta import ETA
+except ImportError:
+    pass
+import ngsutils.support
+
+
+def bam_open(fname, mode='r', *args, **kwargs):
+    if fname.lower()[-4:] == '.bam':
+        return pysam.Samfile(fname, '%sb' % mode, *args, **kwargs)
+    return pysam.Samfile(fname, '%s' % mode, *args, **kwargs)
+
+
+def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None):
+    if not quiet and bam.filename:
+        eta = ETA(os.stat(bam.filename).st_size)
+    else:
+        eta = None
+
+    for pileup in bam.pileup(mask=mask):
+        pos = bam.tell()
+        bgz_offset = pos >> 16
+
+        if not quiet and eta:
+            if callback:
+                eta.print_status(bgz_offset, extra=callback(pileup))
+            else:
+                eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos))
+
+        yield pileup
+
+    if eta:
+        eta.done()
+
+
+def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None):
+    '''
+    >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
+    ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
+    '''
+
+    if os.path.exists('%s.bai' % bam.filename):
+        # This is an indexed file, so it is ref sorted...
+        # Meaning that we should show chrom:pos, instead of read names
+        show_ref_pos = True
+
+    eta = None
+
+    if not ref:
+        if not quiet and bam.filename:
+            eta = ETA(os.stat(bam.filename).st_size)
+
+        for read in bam:
+            pos = bam.tell()
+            bgz_offset = pos >> 16
+
+            if not quiet and eta:
+                if callback:
+                    eta.print_status(bgz_offset, extra=callback(read))
+                elif show_ref_pos:
+                    if read.tid > -1:
+                        eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
+                    else:
+                        eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname))
+                else:
+                    eta.print_status(bgz_offset, extra='%s' % read.qname)
+
+            yield read
+
+    else:
+        working_chrom = None
+        if ref in bam.references:
+            working_chrom = ref
+        elif ref[0:3] == 'chr':
+            # compensate for Ensembl vs UCSC ref naming
+            if ref[3:] in bam.references:
+                working_chrom = ref[3:]
+
+        if not working_chrom:
+            raise ValueError('Missing reference: %s' % ref)
+
+        tid = bam.gettid(working_chrom)
+
+        if not start:
+            start = 0
+        if not end:
+            end = bam.lengths[tid]
+
+        if not quiet and bam.filename:
+            eta = ETA(end - start)
+
+        for read in bam.fetch(working_chrom, start, end):
+            if not quiet and eta:
+                if callback:
+                    eta.print_status(read.pos - start, extra=callback(read))
+                else:
+                    eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
+
+            yield read
+
+    if eta:
+        eta.done()
+
+
+def bam_batch_reads(bam, quiet=False):
+    '''
+    Batch mapping for the same reads (qname) together, this way
+    they can all be compared/converted together.
+    '''
+    reads = []
+    last = None
+    for read in bam_iter(bam, quiet=quiet):
+        if last and read.qname != last:
+            yield reads
+            reads = []
+        last = read.qname
+        reads.append(read)
+
+    if reads:
+        yield reads
+
+
+bam_cigar = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']
+bam_cigar_op = {
+    'M': 0,
+    'I': 1,
+    'D': 2,
+    'N': 3,
+    'S': 4,
+    'H': 5,
+    'P': 6,
+    '=': 7,
+    'X': 8,
+}
+
+
+def cigar_fromstr(s):
+    '''
+    >>> cigar_fromstr('10M5I20M')
+    [(0, 10), (1, 5), (0, 20)]
+    >>> cigar_fromstr('1M1I1D1N1S1H1P')
+    [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
+    '''
+    ret = []
+    spl = re.split('([0-9]+)', s)[1:]
+    for op, size in zip(spl[1::2], spl[::2]):
+        ret.append((bam_cigar_op[op], int(size)))
+    return ret
+
+
+def cigar_tostr(cigar):
+    '''
+    >>> cigar_tostr(((0, 10), (1, 5), (0, 20)))
+    '10M5I20M'
+    >>> cigar_tostr(((0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)))
+    '1M1I1D1N1S1H1P'
+    '''
+
+    s = ''
+
+    for op, size in cigar:
+        s += '%s%s' % (size, bam_cigar[op])
+
+    return s
+
+
+def cigar_read_len(cigar):
+    '''
+    >>> cigar_read_len(cigar_fromstr('8M'))
+    8
+    >>> cigar_read_len(cigar_fromstr('8M100N8M'))
+    16
+    >>> cigar_read_len(cigar_fromstr('8M10I8M'))
+    26
+    >>> cigar_read_len(cigar_fromstr('8M10D8M'))
+    16
+    '''
+
+    read_pos = 0
+
+    for op, length in cigar:
+        if op == 0:  # M
+            read_pos += length
+        elif op == 1:  # I
+            read_pos += length
+        elif op == 2:  # D
+            pass
+        elif op == 3:  # N
+            pass
+        elif op == 4:  # S
+            read_pos += length
+        elif op == 5:  # H
+            pass
+        elif op == 6:  # P (padding: consumes neither read nor reference)
+            pass
+        elif op == 7:  # =
+            read_pos += length
+        elif op == 8:  # X
+            read_pos += length
+        else:
+            raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+    return read_pos
+
+
+def read_calc_mismatches(read):
+    inserts = 0
+    deletions = 0
+    indels = 0
+    edits = int(read.opt('NM'))
+    #
+    # NM counts the length of indels
+    # We really just care about *if* there is an indel, not the size
+    #
+
+    for op, length in read.cigar:
+        if op == 1:
+            inserts += length
+            indels += 1
+        elif op == 2:
+            deletions += length
+            indels += 1
+
+    return edits - inserts - deletions + indels
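+
+# Worked example (added note): NM:i:5 with CIGAR 20M2I30M1D10M gives
+# edits=5, inserts=2, deletions=1, indels=2, so the function returns
+# 5 - 2 - 1 + 2 = 4 (two indel events plus two substitutions).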
+
+
+def _extract_md_matches(md, maxlength):
+    md_pos = 0
+
+    while md and md_pos < maxlength:
+        tmp = '0'  # preload a zero so that immediate mismatches will be caught
+                   # the zero will have no effect otherwise...
+
+        # look for matches
+        while md and md[0] in '0123456789':
+            tmp += md[0]
+            md = md[1:]
+
+        pos = int(tmp)
+        if pos > maxlength:
+            return (maxlength, '%s%s' % (pos - maxlength, md))
+        return (pos, md)
+
+
+def read_calc_variations(read):
+    'see _read_calc_variations'
+    for tup in _read_calc_variations(read.pos, read.cigar, read.opt('MD'), read.seq):
+        yield tup
+
+
+def _read_calc_variations(start_pos, cigar, md, seq):
+    '''
+    For each variation, outputs a tuple: (op, pos, seq)
+
+    op  - operation (0 = mismatch, 1 = insert, 2 = deletion) (like CIGAR)
+    pos - 0-based position of the variation (relative to reference)
+    seq - the base (or bases) involved in the variation
+          for mismatch or insert, this is the sequence inserted
+          for deletions, this is the reference sequence that was removed
+
+    MD is the mismatch string. Not all aligners include the tag. If your aligner
+    doesn't include this, then you'll need to add it, or use a different function
+    (see: read_calc_mismatches_gen).
+
+    Special care must be used to handle RNAseq reads that cross
+    an exon-exon junction.
+
+    Also: MD is a *really* dumb format that can't be read correctly with
+          a regex. It must be processed in concert with the CIGAR alignment
+          in order to catch all edge cases. Some implementations insert 0's
+          at the end of inserts / deletions / variations to make parsing easier
+          but not everyone follows this. Look at the complex examples: the
+          CIGAR alignment may show an insert, but the MD just shows all matches.
+
+    Examples: See: http://davetang.org/muse/2011/01/28/perl-and-sam/
+              Also from CCBB actual mappings and manual altered (shortened,
+              made more complex)
+              (doctests included)
+
+    Match/mismatch
+    CIGAR: 36M
+    MD:Z:  1A0C0C0C1T0C0T27
+    MD:Z:  1ACCC1TCT27 (alternative)
+                   1         2
+          123456789012345678901234567890123456
+    ref:  CGATACGGGGACATCCGGCCTGCTCCTTCTCACATG
+           XXXX XXX
+    read: CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG
+          MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+          -ACCC-TCT---------------------------
+    >>> list(_read_calc_variations(1, [(0,36)], '1A0C0C0C1T0C0T27', 'CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG'))
+    [(0, 2, 'A'), (0, 3, 'C'), (0, 4, 'C'), (0, 5, 'C'), (0, 7, 'T'), (0, 8, 'C'), (0, 9, 'T')]
+
+    Insert
+    CIGAR: 6M1I29M
+    MD:Z: 0C1C0C1C0T0C27
+          C1CC1CTC27 (alt)
+                    1         2
+          123456^789012345678901234567890123456
+    ref:  CACCCC^TCTGACATCCGGCCTGCTCCTTCTCACAT
+          X XX X|XX
+    read: GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT
+          MMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+          G-GA-GGGG---------------------------
+    >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], '0C1C0C1C0T0C27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT'))
+    [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')]
+    >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], 'C1CC1CTC27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT'))
+    [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')]
+
+
+    Deletion
+    CIGAR: 9M9D27M
+    MD:Z: 2G0A5^ATGATGTCA27
+          2GA5^ATGATGTCA27 (alt)
+    ref:  AGGAATGGGATGATGTCAGGGGTTCCAGGTGGAGACGAGGACTCC
+            XX     ^^^^^^^^^
+    read: AGTGATGGG^^^^^^^^^GGGGTTCCAGGTGGAGACGAGGACTCC
+          MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM
+          --TG-----ATGATGTCA---------------------------
+    >>> list(_read_calc_variations(1, [(0,9), (2,9), (0, 27)], '2G0A5^ATGATGTCA27', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC'))
+    [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA')]
+
+
+    Complex
+    CIGAR: 9M9D11M1I15M
+    MD:Z: 2G0A5^ATGATGTCAA26
+    MD:Z: 2G0A5^ATGATGTCA0G26 (alt)
+                   1         2         3         4
+    pos:  123456789012345678901234567890123456789012345
+    ref:  AGGAATGGGATGATGTCAGGGGTTCCAGG^GGAGACGAGGACTCC
+            XX     ^^^^^^^^^X          |
+    read: AGTGATGGG^^^^^^^^^AGGGTTCCAGGTGGAGACGAGGACTCC
+          MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM
+          --TG-----ATGATGTCAG----------T---------------
+    >>> list(_read_calc_variations(1, [(0,9), (2,9), (0,11), (1,1), (0,15)], '2G0A5^ATGATGTCAA26', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC'))
+    [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA'), (0, 19, 'G'), (1, 30, 'T')]
+
+
+    Complex example - inserts aren't separately handled by MD, only visible in CIGAR
+    CIGAR: 14M2D16M3I42M
+    MD:Z:  14^TC58
+                   1         2         3            4         5         6         7
+    pos:  12345678901234567890123456789012^^^345678901234567890123456789012345678901234567
+    ref:  caagtatcaccatgtcaggcatttttttcatt^^^tttgtagagagagaagacttgctatgttgcccaagctggcct
+                        ^^                |||
+    read: CAAGTATCACCATG^^AGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT
+          MMMMMMMMMMMMMMDDMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+          --------------tc----------------TGG------------------------------------------
+    >>> list(_read_calc_variations(1, [(0,14), (2,2), (0,16), (1,3), (0,42)], '14^TC58', 'CAAGTATCACCATGAGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT'))
+    [(2, 15, 'TC'), (1, 33, 'TGG')]
+
+
+    Complex example 2:
+    CIGAR: 41M3I10M1I5M1I2M2I10M
+    MD:Z:  44C2C6T6T6
+                   1         2         3         4            5             6
+    pos:  12345678901234567890123456789012345678901^^^2345678901^23456^78^^9012345678
+    ref:  AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
+                                                   |||   X  X   |   X |  ||   X
+    read: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
+          MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
+          -----------------------------------------tta---A--T---c---A----gt---G------
+
+
+    13M 28M 3I 10M 1I 5M 1I 2M 2I 10M
+    >>> list(_read_calc_variations(1, [(0, 41), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
+    [(1, 42, 'TTA'), (0, 45, 'A'), (0, 48, 'T'), (1, 52, 'C'), (0, 55, 'A'), (1, 57, 'G'), (1, 59, 'GT'), (0, 62, 'G')]
+
+
+    Splice junction example:
+    CIGAR: 62M100N13M
+    MD:Z:  2T27C44
+                                                                                 1      1
+                   1         2         3         4         5         6           6      7
+    pos:  12345678901234567890123456789012345678901234567890123456789012| [100] |3456789012345
+    ref:  CCTCATGACCAGCTTGTTGAAGAGATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC
+            X                           X
+    read: CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC
+          --G---------------------------T-----------------------------------------------------
+
+    >>> list(_read_calc_variations(1, [(0,62), (4,100), (0,13)], '2T27C44', 'CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC'))
+    [(0, 3, 'G'), (0, 31, 'T')]
+
+
+    Splice junction example 2:
+    CIGAR: 13M100N28M3I10M1I5M1I2M2I10M
+    MD:Z:  44C2C6T6T6
+                                      1         1         1            1             1
+                   1                  2         3         4            5             6
+    pos:  1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678
+    ref:  AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
+                                                            |||   X  X   |   X |  ||   X
+    read: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
+          MMMMMMMMMMMMM         MMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
+          -------------         ----------------------------tta---A--T---c---A----gt---G------
+
+    13M 100N 28M 3I 10M 1I 5M 1I 2M 2I 10M
+    >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 28), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
+    [(1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')]
+
+
+    Splice junction example 2A:
+    CIGAR: 13M100N7M2D19M3I10M1I5M1I2M2I10M
+    MD:Z:  9A10^GG22C2C6T6T6
+                                      1         1         1            1             1
+                   1                  2         3         4            5             6
+    pos:  1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678
+    ref:  AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
+                                       ^^                   |||   X  X   |   X |  ||   X
+    read: AGGGTGGCGCGAT|-------|CGATGAC^^CATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
+          MMMMMMMMMMMMM         MMMMMMMDDMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
+          ---------C---         ----------------------------tta---A--T---c---A----gt---G------
+          .........A...         .......GG...................   ...C..C... ...T. ..  ...T......
+              9    A        10        ^GG             22          C 2C   6   T     6   T   6
+
+    >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 7), (2, 2), (0, 19), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '9A10^GG22C2C6T6T6', 'AGGGTGGCGCGATCGATGACCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
+    [(0, 10, 'C'), (2, 121, 'GG'), (1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')]
+
+    Real Example
+    242_1071_1799_B1
+    CIGAR: 42M10I3M1D9M1D11M
+    MD:Z:  27G16A0^T6C2^T1C9
+                   1         2         3         4                   5         6         7
+    pos:  123456789012345678901234567890123456789012          345678901234567890123456789012345
+    ref:  ACTGAGAAACCCAACCCTCTGAGACCAGCACACCCCTTTCAA^^^^^^^^^^GCATGTTCCTCCCTCCCCTTCTTTG
+                                     X                          X^      X  ^ X
+    read: ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCC^GTTCCTGCC^CGCCTTCTTTG
+          MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIMMMDMMMMMMMMMDMMMMMMMMMMM
+          ---------------------------A--------------^^^^^^^^^^--CT------G--T-G---------
+
+    >>> list(_read_calc_variations(1, [(0,42), (1,10), (0, 3), (2, 1), (0, 9), (2, 1), (0, 11)], '27G16A0^T6C2^T1C9', 'ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCCGTTCCTGCCCGCCTTCTTTG'))
+    [(0, 28, 'A'), (1, 43, 'CACATTTTTG'), (0, 45, 'C'), (2, 46, 'T'), (0, 53, 'G'), (2, 56, 'T'), (0, 58, 'G')]
+
+
+    Real example 2
+    577_1692_891_A1
+    CIGAR: 34M100N39M2I
+    MD:Z:  3T69
+                                                          1         1         1         1
+                   1         2         3                  4         5         6         7
+    pos:  1234567890123456789012345678901234| [100] |567890123456789012345678901234567890123
+    ref:  GGATTCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTC^^
+
+    read: GGAATCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC
+          MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM  NNNNN  MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMII
+          ---A------------------------------         ---------------------------------------TC
+
+    >>> list(_read_calc_variations(1, [(0,34), (3,100), (0, 39), (1, 2)], '3T69', 'GGAATCTTCCCACTGGGTCGATGTTGTTTGTGATCTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC'))
+    [(0, 4, 'A'), (1, 174, 'TC')]
+
+    '''
+
+    ref_pos = start_pos
+    read_pos = 0
+
+    for op, length in cigar:
+        if md and md[0] == '0':
+            md = md[1:]
+        # sys.stderr.write('%s, %s, %s\n' %(op, length, md))
+        if op == 0:  # M
+            # how far in the chunk are we? (do *not* update ref_pos until end)
+            md_pos = 0
+            last = None
+            while md and md_pos < length:
+                if last == (op, length, md):
+                    sys.stderr.write('\nInfinite loop in variant finding!\nPos: %s\nCIGAR: (%s, %s)\n' % (ref_pos, op, length))
+                    sys.exit(1)
+                last = (op, length, md)
+                # sys.stderr.write('%s, %s, %s\n' %(op, length, md))
+                chunk_size, md = _extract_md_matches(md, length - md_pos)
+                # sys.stderr.write('   -> %s, %s\n' %(chunk_size, md))
+                md_pos += chunk_size
+
+                # look for mismatches
+                while md_pos < length and md and md[0] not in '0123456789^':
+                    yield (op, ref_pos + md_pos, seq[read_pos + md_pos])
+                    md = md[1:]
+
+                    md_pos += 1
+
+            ref_pos += length
+            read_pos += length
+
+        elif op == 1:  # I
+            # nothing in MD about inserts...
+            yield (op, ref_pos, seq[read_pos:read_pos + length])
+            read_pos += length
+
+        elif op == 2:  # D
+            # prefixed with '^' and includes all of the removed bases
+            if md[0] == '^':
+                md = md[1:]
+            yield (op, ref_pos, md[:length])
+            md = md[length:]
+            ref_pos += length
+
+        elif op == 3:  # N
+            ref_pos += length
+
+
+def read_calc_mismatches_gen(ref, read, chrom):
+    start = read.pos
+    ref_pos = 0
+    read_pos = 0
+
+    for op, length in read.cigar:
+        if op == 1:
+            # yield (op, genomic pos, base) to match the mismatch branch below
+            yield op, start + ref_pos, None
+            read_pos += length
+        elif op == 2:
+            yield op, start + ref_pos, None
+            ref_pos += length
+        elif op == 3:
+            ref_pos += length
+        elif op == 0:
+            refseq = ref.fetch(chrom, start + ref_pos, start + ref_pos + length)
+            if not refseq:
+                raise ValueError("Reference '%s' not found in FASTA file: %s" % (chrom, ref.filename))
+            cur_pos = start + ref_pos
+            for refbase, readbase in zip(refseq.upper(), read.seq[read_pos:read_pos + length].upper()):
+                if refbase != readbase:
+                    yield op, cur_pos, readbase
+                cur_pos += 1
+            ref_pos += length
+            read_pos += length
+        else:
+            raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+
+def read_calc_mismatches_ref(ref, read, chrom):
+    edits = 0
+
+    for op, pos, base in read_calc_mismatches_gen(ref, read, chrom):
+        edits += 1
+
+    return edits
+
+
+__region_cache = {}
+
+
+def region_pos_to_genomic_pos(name, start, cigar):
+    '''
+        converts a junction position to a genomic location given a junction
+        ref name, the junction position, and the cigar alignment.
+
+        returns: (genomic ref, genomic pos, genomic cigar)
+
+    >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050,3000-4000', 25, [(0, 100)])
+    ('chr1', 1025, [(0, 25), (3, 950), (0, 50), (3, 950), (0, 25)])
+
+    >>> region_pos_to_genomic_pos('chr1:1000-1050,1050-1200', 25, [(0, 100)])
+    ('chr1', 1025, [(0, 25), (3, 0), (0, 75)])
+
+    >>> region_pos_to_genomic_pos('chr3R:17630851-17630897,17634338-17634384', 17, [(0, 39)])
+    ('chr3R', 17630868, [(0, 29), (3, 3441), (0, 10)])
+
+    >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(4, 25), (0, 50), (4, 25)])
+    ('chr1', 1025, [(4, 25), (0, 25), (3, 950), (0, 25), (4, 25)])
+
+    >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)])
+    ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)])
+
+    >>> region_pos_to_genomic_pos('chr7:16829153-16829246,16829246-16829339', 62, cigar_fromstr('83M18S'))
+    ('chr7', 16829215, [(0, 31), (3, 0), (0, 52), (4, 18)])
+
+
+    '''
+
+    if name in __region_cache:
+        chrom, fragments = __region_cache[name]
+    else:
+        c1 = name.split(':')
+        chrom = c1[0]
+
+        fragments = []
+        for fragment in c1[1].split(','):
+            s, e = fragment.split('-')
+            fragments.append((int(s), int(e)))
+
+        __region_cache[name] = (chrom, fragments)
+
+    chr_cigar = []
+    chr_start = fragments[0][0]
+
+    read_start = int(start)
+
+    frag_idx = 0
+    frag_start = 0
+    frag_end = 0
+
+    for i, (s, e) in enumerate(fragments):
+        if chr_start + read_start < e:
+            chr_start += read_start
+            frag_idx = i
+            frag_start = s
+            frag_end = e
+            break
+
+        else:
+            chr_start += (e - s)
+            read_start -= (e - s)
+
+    cur_pos = chr_start
+
+    for op, length in cigar:
+        if op in [1, 4, 5]:
+            chr_cigar.append((op, length))
+
+        elif op in [0, 2]:
+            if cur_pos + length <= frag_end:
+                cur_pos += length
+                chr_cigar.append((op, length))
+
+            else:
+                while cur_pos + length > frag_end:
+                    if frag_end - cur_pos > 0:
+                        chr_cigar.append((op, frag_end - cur_pos))
+                        length -= (frag_end - cur_pos)
+                    cur_pos = frag_end
+                    frag_idx += 1
+                    if len(fragments) <= frag_idx:
+                        sys.stderr.write('ERROR converting: %s %s\n' % (name, fragments))
+                        return (chrom, 0, chr_cigar)
+                    frag_start, frag_end = fragments[frag_idx]
+                    chr_cigar.append((3, frag_start - cur_pos))
+                    cur_pos = frag_start
+
+                cur_pos = cur_pos + length
+                chr_cigar.append((op, length))
+        else:
+            print "Unsupported CIGAR operation (%s)" % bam_cigar[op]
+            sys.exit(1)
+
+    return (chrom, chr_start, chr_cigar)
+
+
+def is_junction_valid(cigar, min_overlap=4):
+    '''
+        Does the genomic cigar alignment represent a 'good' alignment.
+        Used for checking junction->genome alignments
+
+        1) the alignment must not start at a splice junction
+        2) the alignment must not start or end with an overhang
+        3) the alignment must overhang the splice junction by min_overlap (4)
+
+        |     Exon1       |     Intron     |      Exon2       |
+        |-----------------|oooooooooooooooo|------------------|
+                                            XXXXXXXXXXXXXXXXXXXXXXXX (bad 1)
+      XXXXXXXXXXXXX (bad 2)                           XXXXXXXXXXXXXXXX (bad 2)
+                        XX-----------------XXXXXXXXXXXXXXXXX (bad 3)
+
+    >>> is_junction_valid(cigar_fromstr('1000N40M'))
+    (False, 'Starts at gap (1000N40M)')
+
+    >>> is_junction_valid(cigar_fromstr('100M'))
+    (False, "Doesn't cover junction")
+
+    >>> is_junction_valid(cigar_fromstr('100M1000N3M'), 4)
+    (False, "Too short overlap at 3' (100M1000N3M)")
+
+    >>> is_junction_valid(cigar_fromstr('2M1000N100M'), 4)
+    (False, "Too short overlap at 5' (2M1000N100M)")
+
+    >>> is_junction_valid(cigar_fromstr('4M1000N100M'), 4)
+    (True, '')
+
+    >>> is_junction_valid(cigar_fromstr('100M1000N4M'), 4)
+    (True, '')
+
+    >>> is_junction_valid(cigar_fromstr('4M0N100M'), 4)
+    (True, '')
+
+    '''
+    first = True
+    pre_gap = True
+
+    pre_gap_count = 0
+    post_gap_count = 0
+
+    has_gap = False
+
+    for op, length in cigar:
+        # mapping can't start at a gap
+        if first and op == 3:
+            return (False, 'Starts at gap (%s)' % cigar_tostr(cigar))
+        first = False
+
+        if op == 3:
+            pre_gap = False
+            post_gap_count = 0
+            has_gap = True
+
+        elif pre_gap:
+            pre_gap_count += length
+        else:
+            post_gap_count += length
+
+        # mapping must start with more than min_overlap base match
+
+    if not has_gap:
+        return (False, "Doesn't cover junction")
+    elif pre_gap_count < min_overlap:
+        return (False, "Too short overlap at 5' (%s)" % cigar_tostr(cigar))
+    elif post_gap_count < min_overlap:
+        return (False, "Too short overlap at 3' (%s)" % cigar_tostr(cigar))
+
+    return True, ''
+
+
+def read_alignment_fragments_gen(read):
+    '''
+    Takes a read and returns the start and end positions for each match/mismatch region.
+    This will let us know where each read alignment "touches" the genome.
+    '''
+
+    for start, end in _read_alignment_fragments_gen(read.pos, read.cigar):
+        yield (start, end)
+
+
+def _read_alignment_fragments_gen(pos, cigar):
+    ''' Test-able version of read_alignment_fragments_gen
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('50M')))
+    [(1, 51)]
+
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('25M100N25M')))
+    [(1, 26), (126, 151)]
+
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('20M1D4M100N10M5I10M')))
+    [(1, 21), (22, 26), (126, 136), (136, 146)]
+    '''
+    ref_pos = pos
+
+    for op, length in cigar:
+        if op == 0:
+            yield (ref_pos, ref_pos + length)
+            ref_pos += length
+        elif op == 1:
+            pass
+        elif op == 2:
+            ref_pos += length
+        elif op == 3:
+            ref_pos += length
+        else:
+            raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+
+def read_cigar_at_pos(cigar, qpos, is_del):
+    '''
+    Returns the CIGAR operation for a given read position.
+
+    qpos is the 0-based index of the base in the read. If is_del is True,
+    the operation *following* the block that ends at qpos is returned, so
+    callers can tell when the next reference base is deleted.
+    '''
+    pos = 0
+    returnnext = False
+    for op, length in cigar:
+        if returnnext:
+            return op
+        if op == 0:
+            pos += length
+        elif op == 1:
+            pos += length
+        elif op == 2:
+            pass
+        elif op == 3:
+            pass
+        elif op == 4:
+            pos += length
+        elif op == 5:
+            pass
+        else:
+            raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+        if pos > qpos:
+            return op
+        if is_del and pos == qpos:
+            returnnext = True
+
+    return None
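+
+
+# Editor's sketch (not part of the original module): _example_read_cigar_at_pos
+# only demonstrates read_cigar_at_pos() on a hypothetical 5M2D5M alignment
+# (op 0 = M, 2 = D).
+def _example_read_cigar_at_pos():
+    '''
+    >>> cigar = [(0, 5), (2, 2), (0, 5)]
+    >>> read_cigar_at_pos(cigar, 4, False)
+    0
+    >>> read_cigar_at_pos(cigar, 5, True)
+    2
+    >>> read_cigar_at_pos(cigar, 5, False)
+    0
+    '''
+    pass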
+
+
+def cleancigar(cigar):
+    '''
+    Cleans a CIGAR alignment to remove zero length operations
+
+    >>> cigar_tostr(cleancigar(cigar_fromstr('31M0N52M18S')))
+    '83M18S'
+
+    '''
+    newcigar = []
+
+    changed = False
+    zero = False
+
+    for op, size in cigar:
+        if size > 0:
+            if zero and newcigar and newcigar[-1][0] == op:
+                # merge with the previous op of the same type that the
+                # zero-length op was splitting
+                newcigar[-1] = (op, newcigar[-1][1] + size)
+            else:
+                newcigar.append((op, size))
+            zero = False
+        else:
+            changed = True
+            zero = True
+
+    if changed:
+        return newcigar
+
+    return None
+
+
+def read_cleancigar(read):
+    '''
+    Replaces the CIGAR string for a read to remove any operations that are zero length.
+
+    This happens to the read in-place
+    '''
+    if read.is_unmapped:
+        return False
+
+    newcigar = []
+
+    newcigar = cleancigar(read.cigar)
+
+    if newcigar:
+        read.cigar = newcigar
+        return True
+
+    return False
+
+
+def read_to_unmapped(read, ref=None):
+    '''
+    Converts a read from mapped to unmapped.
+
+    Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is passed)
+    '''
+
+    newread = pysam.AlignedRead()
+
+    tags = []
+    if ref:
+        tags.append(('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar))))
+
+    newread.is_unmapped = True
+    newread.mapq = 0
+    newread.tlen = 0
+    newread.pos = -1
+    newread.pnext = -1
+    newread.rnext = -1
+    newread.tid = -1
+
+    newread.qname = read.qname
+
+    if read.is_paired:
+        newread.is_paired = True
+
+    if not read.is_unmapped and read.is_reverse:
+        newread.seq = ngsutils.support.revcomp(read.seq)
+        newread.qual = read.qual[::-1]
+    else:
+        newread.seq = read.seq
+        newread.qual = read.qual
+
+    newread.tags = tags
+
+    return newread
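+
+
+# Editor's sketch (not part of the original module): typical use of
+# read_to_unmapped() with pysam; the file names and mapq cutoff below are
+# hypothetical.
+#
+#     bam = pysam.Samfile('in.bam', 'rb')
+#     out = pysam.Samfile('out.bam', 'wb', template=bam)
+#     for read in bam:
+#         if not read.is_unmapped and read.mapq < 10:
+#             read = read_to_unmapped(read, bam.getrname(read.tid))
+#         out.write(read)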
+
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
Binary file ngsutils/bam/__init__.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/bed/__init__.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,282 @@
+import os
+import ngsutils.support.ngs_utils
+import pysam
+
+
+class BedStreamer(object):
+    '''
+    Streams BedRegions from a BED file
+
+    Note - this can only be used once! There is no mechanism to seek the stream.
+    '''
+
+    def __init__(self, fname=None, fileobj=None, quiet=False):
+        if not fname and not fileobj:
+            raise ValueError("You must specify either fname or fileobj!")
+
+        self.reader = ngsutils.support.gzip_reader(fname=fname, quiet=quiet, fileobj=fileobj)
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        # when the underlying reader is exhausted, its StopIteration
+        # propagates naturally; a bare except here would mask parsing errors
+        while True:
+            line = self.reader.next().strip()
+            if line and line[0] != '#':
+                cols = line.split('\t')
+                while len(cols) < 6:
+                    cols.append('')
+
+                return BedRegion(*cols)
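+
+
+# Editor's sketch (not part of the original module): stream BED records from
+# an in-memory file object; quiet=True skips the optional progress meter.
+def _example_bedstreamer():
+    '''
+    >>> from StringIO import StringIO
+    >>> stream = BedStreamer(fileobj=StringIO('chr1\\t100\\t200\\tfoo\\t0\\t+\\n'), quiet=True)
+    >>> region = stream.next()
+    >>> region.chrom, region.start, region.end, region.strand
+    ('chr1', 100, 200, '+')
+    '''
+    pass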
+
+
+class BedFile(object):
+    '''
+    BED files are read entirely into memory, in a series of bins. Each bin
+    is ~100kb in size. Each bin can then be iterated over.
+
+    This is less efficient than using a proper index, but in reality, this
+    usually isn't an issue. However, if the BED file has been Tabix indexed,
+    that index will be used for random access.
+
+    NOTE: This isn't very efficient, so perhaps this can be remodeled into a BedFile
+    and a BedFileIndex where the file is indexed only if random access is requested.
+    '''
+
+    _bin_const = 100000
+
+    def __init__(self, fname=None, fileobj=None, region=None):
+        self._bins = {}
+        self._bin_list = []
+        self._cur_bin_idx = 0
+        self._cur_bin_pos = 0
+        self._tellpos = 0
+        self._total = 0
+        self._length = 0
+        self.__tabix = None
+
+        self.filename = fname
+
+        if fname and os.path.exists('%s.tbi' % fname):
+            self.__tabix = pysam.Tabixfile(fname)
+
+        if fileobj:
+            self.__readfile(fileobj)
+        elif fname:
+            with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj:
+                self.__readfile(fobj)
+        elif region:
+            chrom, startend = region.split(':')
+            if '-' in startend:
+                start, end = [int(x) for x in startend.split('-')]
+            else:
+                start = int(startend)
+                end = start
+            start -= 1
+
+            self.__add_region(BedRegion(chrom, start, end))
+        else:
+            raise ValueError("Must specify either filename, fileobj, or region")
+
+    def __readfile(self, fobj):
+        for line in fobj:
+            line = line.strip()
+            if line and line[0] != '#':
+                cols = line.split('\t')
+                while len(cols) < 6:
+                    cols.append('')
+
+                region = BedRegion(*cols)
+                self.__add_region(region)
+
+        self._bin_list.sort()
+        for bin in self._bins:
+            self._bins[bin].sort()
+
+    def __add_region(self, region):
+        self._total += region.end - region.start
+        self._length += 1
+
+        startbin = region.start / BedFile._bin_const
+        endbin = region.end / BedFile._bin_const
+
+        for bin in xrange(startbin, endbin + 1):
+            if not (region.chrom, bin) in self._bins:
+                self._bin_list.append((region.chrom, bin))
+                self._bins[(region.chrom, bin)] = []
+            self._bins[(region.chrom, bin)].append(region)
+
+    def fetch(self, chrom, start, end, strand=None):
+        '''
+        For TABIX indexed BED files, find all regions w/in a range
+
+        For non-TABIX index BED files, use the calculated bins, and
+        output matching regions
+        '''
+
+        if self.__tabix:
+            for match in self.__tabix.fetch(chrom, start, end):
+                region = BedRegion(*match.split('\t'))
+                if not strand or (strand and region.strand == strand):
+                    yield region
+        else:
+            startbin = start / BedFile._bin_const
+            endbin = end / BedFile._bin_const
+
+            buf = set()
+
+            for bin in xrange(startbin, endbin + 1):
+                if (chrom, bin) in self._bins:
+                    for region in self._bins[(chrom, bin)]:
+                        if strand and strand != region.strand:
+                            continue
+                        # a region matches if either endpoint falls inside the
+                        # window, or the region spans the entire window
+                        if (start <= region.start <= end) or (start <= region.end <= end) or (region.start < start and region.end > end):
+                            if region not in buf:
+                                yield region
+                                buf.add(region)
+
+    def tell(self):
+        return self._tellpos
+
+    def close(self):
+        pass
+
+    @property
+    def length(self):
+        return self._length
+
+    @property
+    def total(self):
+        return self._total
+
+    def __iter__(self):
+        self._cur_bin_idx = 0
+        self._cur_bin_pos = 0
+        self._tellpos = 0
+        return self
+
+    def next(self):
+        if self._cur_bin_idx >= len(self._bin_list):
+            raise StopIteration
+
+        binvals = self._bins[self._bin_list[self._cur_bin_idx]]
+        while self._cur_bin_pos < len(binvals):
+            val = binvals[self._cur_bin_pos]
+            self._cur_bin_pos += 1
+
+            startbin = (val.chrom, val.start / BedFile._bin_const)
+            if startbin == self._bin_list[self._cur_bin_idx]:
+                self._tellpos += 1
+                return val
+
+        self._cur_bin_idx += 1
+        self._cur_bin_pos = 0
+        return self.next()
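+
+
+# Editor's sketch (not part of the original module): the ~100kb binning used
+# by BedFile -- a region is registered in every bin it overlaps (note the
+# Python 2 integer division).
+def _example_bedfile_bins():
+    '''
+    >>> region = BedRegion('chr1', 99999, 200001)
+    >>> region.start / BedFile._bin_const, region.end / BedFile._bin_const
+    (0, 2)
+    '''
+    pass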
+
+
+class BedRegion(object):
+    def __init__(self, chrom, start, end, name='', score='', strand='', thickStart='', thickEnd='', rgb='', *args):
+        self.chrom = chrom
+        self.start = int(start)
+        self.end = int(end)
+        self.name = name
+
+        if score == '':
+            self.score = 0
+        else:
+            self.score = float(score)
+
+        if strand == '':
+            self.strand = None
+        else:
+            self.strand = strand
+
+        if thickStart == '':
+            self.thickStart = None
+        else:
+            self.thickStart = thickStart
+
+        if thickEnd == '':
+            self.thickEnd = None
+        else:
+            self.thickEnd = thickEnd
+
+        if rgb == '':
+            self.rgb = None
+        else:
+            self.rgb = rgb
+
+        self.extras = args
+
+    def clone(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, rgb=None, *args):
+        cols = []
+        cols.append(self.chrom if chrom is None else chrom)
+        cols.append(self.start if start is None else start)
+        cols.append(self.end if end is None else end)
+        cols.append(self.name if name is None else name)
+        cols.append(self.score if score is None else score)
+        cols.append(self.strand if strand is None else strand)
+        cols.append(self.thickStart if thickStart is None else thickStart)
+        cols.append(self.thickEnd if thickEnd is None else thickEnd)
+        cols.append(self.rgb if rgb is None else rgb)
+
+        for i, val in enumerate(self.extras):
+            if len(args) > i:
+                cols.append(args[i])
+            else:
+                cols.append(val)
+
+        return BedRegion(*cols)
+
+    @property
+    def score_int(self):
+        score = str(self.score)
+        if score[-2:] == '.0':
+            score = score[:-2]
+
+        return score
+
+    def __key(self):
+        return (self.chrom, self.start, self.end, self.strand, self.name)
+
+    def __hash__(self):
+        # keep hashing consistent with __eq__ so equal regions dedup in sets
+        return hash(self.__key())
+
+    def __lt__(self, other):
+        return self.__key() < other.__key()
+
+    def __gt__(self, other):
+        return self.__key() > other.__key()
+
+    def __eq__(self, other):
+        return self.__key() == other.__key()
+
+    def write(self, out):
+        out.write('%s\n' % self)
+
+    def __repr__(self):
+        outcols = []
+
+        if self.rgb:
+            outcols.append(self.rgb)
+        if self.thickEnd or outcols:
+            outcols.append(self.thickEnd if self.thickEnd else self.end)
+        if self.thickStart or outcols:
+            outcols.append(self.thickStart if self.thickStart else self.start)
+        if self.strand or outcols:
+            outcols.append(self.strand)
+        if self.score_int != '' or outcols:
+            outcols.append(self.score_int)
+        if self.name or outcols:
+            outcols.append(self.name)
+
+        outcols.append(self.end)
+        outcols.append(self.start)
+        outcols.append(self.chrom)
+
+        return '\t'.join([str(x) for x in outcols[::-1]])
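+
+
+# Editor's sketch (not part of the original module): BedRegion formats itself
+# back into a tab-delimited BED line, trimming trailing empty columns.
+def _example_bedregion_repr():
+    '''
+    >>> repr(BedRegion('chr1', 100, 200, 'foo', 0, '+')).split('\\t')
+    ['chr1', '100', '200', 'foo', '0', '+']
+    '''
+    pass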
Binary file ngsutils/bed/__init__.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/__init__.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,249 @@
+import collections
+import gzip
+import os
+import sys
+import re
+try:
+    from eta import ETA
+except ImportError:
+    ETA = None  # progress display is optional
+
+class FASTARead(collections.namedtuple('FASTARecord', 'name comment seq')):
+    def __repr__(self):
+        if self.comment:
+            return '>%s %s\n%s\n' % (self.name, self.comment, self.seq)
+        return '>%s\n%s\n' % (self.name, self.seq)
+
+    def subseq(self, start, end, comment=None):
+        if self.comment:
+            # keep the original comment when no new one is given
+            comment = '%s %s' % (self.comment, comment) if comment else self.comment
+
+        return FASTARead(self.name, comment, self.seq[start:end])
+
+    def clone(self, name=None, comment=None, seq=None):
+        n = name if name else self.name
+        c = comment if comment else self.comment
+        s = seq if seq else self.seq
+
+        return FASTARead(n, c, s)
+
+    def write(self, out):
+        out.write(repr(self))
+
+
+class FASTA(object):
+    def __init__(self, fname=None, fileobj=None, qual=False):
+        self.fname = fname
+        self.qual = qual
+        if fileobj:
+            self.fileobj = fileobj
+        else:
+            if self.fname == '-':
+                self.fileobj = sys.stdin
+            elif self.fname[-3:] == '.gz' or self.fname[-4:] == '.bgz':
+                self.fileobj = gzip.open(os.path.expanduser(self.fname))
+            else:
+                self.fileobj = open(os.path.expanduser(self.fname))
+
+        if not self.fileobj:
+            raise ValueError("Missing valid filename or fileobj")
+
+    def close(self):
+        if self.fileobj != sys.stdin:
+            self.fileobj.close()
+
+    def tell(self):
+        # always relative to uncompressed...
+        return self.fileobj.tell()
+
+    def seek(self, pos, whence=0):
+        self.fileobj.seek(pos, whence)
+
+    def fetch(self, quiet=False):
+        name = ''
+        comment = ''
+        seq = ''
+
+        if ETA and not quiet and self.fname and self.fname != '-':
+            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
+        else:
+            eta = None
+
+        for line in self.fileobj:
+            line = line.strip()
+            if not line:
+                continue
+            if line[0] == '#':
+                continue
+
+            if line[0] == '>':
+                if name and seq:
+                    if eta:
+                        eta.print_status(extra=name)
+                    yield FASTARead(name, comment, seq)
+
+                spl = re.split(r'[ \t]', line[1:], maxsplit=1)
+                name = spl[0]
+                if len(spl) > 1:
+                    comment = spl[1]
+                else:
+                    comment = ''
+                seq = ''
+
+            else:
+                if self.qual:
+                    seq = seq + ' ' + line
+                else:
+                    seq += line
+
+        if name and seq:
+            if eta:
+                eta.print_status(extra=name)
+            yield FASTARead(name, comment, seq)
+
+        if eta:
+            eta.done()
+
+
+def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None):
+    if fileobj:
+        f = fileobj
+    elif fname == '-':
+        f = sys.stdin
+    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+        f = gzip.open(os.path.expanduser(fname))
+    else:
+        f = open(os.path.expanduser(fname))
+
+    if quiet or not fname or fname == '-' or ETA is None:
+        eta = None
+    else:
+        eta = ETA(os.stat(fname).st_size, fileobj=f)
+
+    for line in f:
+        if eta:
+            if callback:
+                extra = callback()
+            else:
+                extra = ''
+
+            eta.print_status(extra=extra)
+        yield line
+
+        if done_callback and done_callback():
+            break
+
+    if f != sys.stdin:
+        f.close()
+
+    if eta:
+        eta.done()
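+
+
+# Editor's sketch (not part of the original module): gzip_reader yields lines
+# from a plain, gzipped, or already-open source; quiet=True skips the
+# progress meter.
+def _example_gzip_reader():
+    '''
+    >>> from StringIO import StringIO
+    >>> list(gzip_reader(None, quiet=True, fileobj=StringIO('line1\\nline2\\n')))
+    ['line1\\n', 'line2\\n']
+    '''
+    pass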
+
+
+class Symbolize(object):
+    'Converts strings to symbols - basically a cache of strings'
+    def __init__(self):
+        self.__cache = {}
+
+    def __getitem__(self, k):
+        if not k in self.__cache:
+            self.__cache[k] = k
+
+        return self.__cache[k]
+
+symbols = Symbolize()
+
+_complements = {
+    'a': 't',
+    'A': 'T',
+    'c': 'g',
+    'C': 'G',
+    'g': 'c',
+    'G': 'C',
+    't': 'a',
+    'T': 'A',
+    'n': 'n',
+    'N': 'N',
+}
+
+
+def revcomp(seq):
+    '''
+    >>> revcomp('ATCGatcg')
+    'cgatCGAT'
+    '''
+    ret = []
+
+    for s in seq:
+        ret.append(_complements[s])
+
+    ret.reverse()
+    return ''.join(ret)
+
+
+class Counts(object):
+    '''
+    Set up simple binning.  Bins are continuous 0->max.  Values are added to
+    bins and then means / distributions can be calculated.
+    '''
+    def __init__(self):
+        self.bins = []
+
+    def add(self, val):
+        while len(self.bins) <= val:
+            self.bins.append(0)
+        self.bins[val] += 1
+
+    def mean(self):
+        acc = 0
+        count = 0
+
+        for i, val in enumerate(self.bins):
+            acc += (i * val)
+            count += val
+
+        if count > 0:
+            return float(acc) / count
+
+    def max(self):
+        return len(self.bins) - 1
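+
+
+# Editor's sketch (not part of the original module): bin a few values with
+# Counts and read back the mean and the largest observed value.
+def _example_counts():
+    '''
+    >>> c = Counts()
+    >>> for v in [1, 1, 2, 5]:
+    ...     c.add(v)
+    >>> c.mean()
+    2.25
+    >>> c.max()
+    5
+    '''
+    pass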
+
+
+def memoize(func):
+    if 'TESTING' in os.environ or 'DEBUG' in os.environ:
+        return func
+
+    __cache = {}
+    def inner(*args, **kwargs):
+        k = (args, tuple(kwargs.iteritems()))
+        if k not in __cache:
+            __cache[k] = func(*args, **kwargs)
+        return __cache[k]
+
+    inner.__doc__ = '(@memoized %s)\n%s' % (func.__name__, func.__doc__)
+    return inner
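+
+
+# Editor's sketch (not part of the original module): memoize caches results
+# keyed on the call arguments (disabled when TESTING or DEBUG is set).
+def _example_memoize():
+    '''
+    >>> @memoize
+    ... def square(x):
+    ...     return x * x
+    >>> square(4)
+    16
+    '''
+    pass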
+
+
+def quoted_split(s, delim, quote_char='"'):
+    tokens = []
+
+    buf = ""
+    inquote = False
+
+    for c in s:
+        if inquote:
+            buf += c
+            if c == quote_char:
+                inquote = False
+        elif c == delim:
+            tokens.append(buf)
+            buf = ""
+        else:
+            buf += c
+            if c == quote_char:
+                inquote = True
+
+    if buf:
+        tokens.append(buf)
+
+    return tokens
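+
+
+# Editor's sketch (not part of the original module): quoted_split keeps the
+# delimiter when it appears inside quotes.
+def _example_quoted_split():
+    '''
+    >>> quoted_split('a,"b,c",d', ',')
+    ['a', '"b,c"', 'd']
+    '''
+    pass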
Binary file ngsutils/support/__init__.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/bgzip.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+'''
+Extract the blocks from a BGZip file.
+
+BAM files are stored as blocks in a bgzip archive. This class
+will load the bgzip archive and output the block information.
+'''
+
+import sys
+import os
+import struct
+
+
+class BGZip(object):
+    def __init__(self, fname):
+        self.fname = fname
+        self.pos = 0
+        self.fileobj = open(self.fname, 'rb')  # BGZF data is binary
+        self.fsize = os.stat(self.fname).st_size
+        self.cur_chunk = 0
+
+        self.cpos = 0
+        self.cdata = ''
+
+    def close(self):
+        self.fileobj.close()
+
+    def seek(self, offset):
+        bgz_offset = offset >> 16
+        chunk_offset = offset & 0xFFFF
+
+        self.fileobj.seek(bgz_offset)
+        self.read_chunk()
+        self.cpos = chunk_offset
+
+    def read(self, amount, whence=0):
+        if whence not in [0, 1]:
+            print "Bad whence value!: %s" % whence
+            return
+
+        if whence == 0:
+            # whence 0 reads from the start of the file
+            self.seek(0)
+
+        ### read from the current chunk; load the next chunk when it runs out
+        ret = ''
+        while amount and self.pos < self.fsize:
+            available = len(self.cdata) - self.cpos
+            if available >= amount:
+                ret += self.cdata[self.cpos:self.cpos + amount]
+                self.cpos += amount
+                return ret
+            else:
+                ret += self.cdata[self.cpos:]
+                amount -= available
+                self.read_chunk()
+        return ret
+
+    def read_chunk(self):
+        # read the 12 fixed gzip header bytes (through XLEN) from the
+        # current file position
+        id1, id2, cm, flg, mtime, xfl, gz_os, xlen = self._read_fields('<BBBBIBBH')
+        subpos = 0
+        bsize = 0
+
+        while subpos < xlen:
+            si1, si2, slen = self._read_fields('<BBH')
+            if si1 == 66 and si2 == 67:
+                bsize, = self._read_fields('<H')
+            else:
+                self.fileobj.seek(slen, 1)
+                self.pos += slen
+
+            # each subfield is a 4-byte header plus its payload
+            subpos += 4 + slen
+
+        cdata_size = bsize - xlen - 19
+        self.cdata = self.fileobj.read(cdata_size)  # raw deflated block data
+        self.pos += cdata_size
+        self.fileobj.seek(8, 1)  # skip CRC32 and ISIZE to reach the next block
+        self.pos += 8
+
+        self.cur_chunk += 1
+        self.cpos = 0
+
+    def dump(self):
+        self.fileobj.seek(0)
+        block_num = 0
+
+        while self.pos < self.fsize:
+            print "[BLOCK %s]" % block_num
+            # read header
+            id1, id2, cm, flg, mtime, xfl, gz_os, xlen = self._read_fields('<BBBBIBBH')
+            print 'id1: %s' % id1
+            print 'id2: %s' % id2
+            print 'cm: %s' % cm
+            print 'flg: %s' % flg
+            print 'mtime: %s' % mtime
+            print 'xfl: %s' % xfl
+            print 'os: %s' % gz_os
+            print 'xlen: %s' % xlen
+
+            # read subfields
+            subpos = 0
+            bsize = 0
+
+            while subpos < xlen:
+                si1, si2, slen = self._read_fields('<BBH')
+                print '    si1: %s' % si1
+                print '    si2: %s' % si2
+                print '    slen: %s' % slen
+                print '    data: [%s bytes]' % slen
+
+                if si1 == 66 and si2 == 67:
+                    bsize, = self._read_fields('<H')
+                else:
+                    self.fileobj.seek(slen, 1)
+                    self.pos += slen
+
+                subpos += 4 + slen
+
+            cdata_size = bsize - xlen - 19
+
+            print 'bsize: %s' % bsize
+            print 'cdata: [%s]' % (cdata_size)
+
+            self.fileobj.seek(cdata_size, 1)
+            self.pos += cdata_size
+            crc, isize = self._read_fields('<II')
+
+            print "crc: %s" % crc
+            print "isize: %s" % isize
+            # print "POS: %s" % self.pos
+
+            block_num += 1
+
+    def _read_fields(self, field_types):
+        size = struct.calcsize(field_types)
+        self.pos += size
+        return struct.unpack(field_types, self.fileobj.read(size))
+
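+# Editor's sketch (not part of the original module): the BGZF "virtual offset"
+# consumed by seek() packs the compressed block offset into the high bits and
+# the position within the inflated block into the low 16 bits.
+def _example_virtual_offset():
+    '''
+    >>> voffset = (123456 << 16) | 42
+    >>> voffset >> 16, voffset & 0xFFFF
+    (123456, 42)
+    '''
+    pass
+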
+if __name__ == '__main__':
+    BGZip(sys.argv[1]).dump()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/dbsnp.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,141 @@
+'''
+Support package for processing a dbSNP tabix dump from UCSC.
+'''
+
+import pysam
+import collections
+import sys
+from ngsutils.support import revcomp
+
+
+class SNPRecord(collections.namedtuple('SNPRecord', '''bin
+chrom
+chromStart
+chromEnd
+name
+score
+strand
+refNCBI
+refUCSC
+observed
+molType
+clazz
+valid
+avHet
+avHetSE
+func
+locType
+weight
+exceptions
+submitterCount
+submitters
+alleleFreqCount
+alleles
+alleleNs
+alleleFreqs
+bitfields
+''')):
+    __slots__ = ()
+
+    @property
+    def alleles(self):
+        alts = []
+        for alt in self.observed.split('/'):
+            if alt != '-' and self.strand == '-':
+                alt = revcomp(alt)
+
+            alts.append(alt)
+
+        return alts
+
+    @property
+    def snp_length(self):
+        return self.chromEnd - self.chromStart
+
+
+def autotype(ar, length=26):
+    out = []
+    for el in ar:
+        try:
+            val = int(el)
+        except ValueError:
+            try:
+                val = float(el)
+            except ValueError:
+                val = el
+
+        out.append(val)
+
+    if len(out) < 12:
+        raise TypeError("Invalid dbSNP file! We need at least 12 columns to work with.")
+
+    while len(out) < length:
+        out.append(None)
+    return out
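+
+
+# Editor's sketch (not part of the original module): autotype coerces numeric
+# columns and pads the row out to the full 26 dbSNP fields (the values below
+# are illustrative, not a real dbSNP record).
+def _example_autotype():
+    '''
+    >>> row = autotype(['585', 'chr1', '10433', '10434', 'rs1', '0', '+',
+    ...                 '-', '-', '-/C', 'genomic', 'insertion'])
+    >>> row[:4]
+    [585, 'chr1', 10433, 10434]
+    >>> len(row)
+    26
+    '''
+    pass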
+
+
+class DBSNP(object):
+    def __init__(self, fname):
+        self.dbsnp = pysam.Tabixfile(fname)
+        self.asTup = pysam.asTuple()
+
+    def fetch(self, chrom, pos):
+        'Note: pos is 0-based'
+
+        # Note: the tabix command-line tool uses 1-based positions, but
+        #       pysam.Tabixfile uses 0-based positions
+
+        for tup in self.dbsnp.fetch(chrom, pos, pos + 1, parser=self.asTup):
+            snp = SNPRecord._make(autotype(tup))
+            if snp.chromStart == pos:
+                yield snp
+
+    def close(self):
+        self.dbsnp.close()
+
+    def dump(self, chrom, op, pos, base, snp, exit=True):
+        print
+        print ' ->', op, chrom, pos, base
+        print snp
+        print snp.alleles
+        if exit:
+            sys.exit(1)
+
+    def is_valid_variation(self, chrom, op, pos, seq, verbose=False):
+        for snp in self.fetch(chrom, pos):
+            if '/' not in snp.observed or snp.clazz not in ['single', 'mixed', 'in-del', 'insertion', 'deletion']:
+                # these are odd variations that we can't deal with... (microsatellites, tooLongToDisplay members, etc)
+                continue
+
+            if op == 0:
+                if snp.clazz in ['single', 'mixed'] and seq in snp.alleles:
+                    return True
+                # elif verbose:
+                #     for alt in snp.alleles:
+                #         if len(alt) > 1:
+                #             self.dump(chrom, op, pos, seq, snp, False)
+
+            elif op == 1:
+                if snp.clazz in ['insertion', 'mixed', 'in-del']:
+                    if seq in snp.alleles:
+                        if len(seq) > 1 and verbose:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        return True
+
+                    if verbose:
+                        if len(seq) > 1:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        else:
+                            for alt in snp.alleles:
+                                if len(alt) > 1:
+                                    self.dump(chrom, op, pos, seq, snp, False)
+
+            elif op == 2:
+                if snp.clazz in ['deletion', 'mixed', 'in-del']:
+                    if '-' in snp.alleles and seq in snp.alleles:
+                        if len(seq) > 1 and verbose:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        return True
+
+        return False
Binary file ngsutils/support/dbsnp.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/llh.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,55 @@
+'''
+Methods for calculating log-likelihoods for nucleotide frequencies
+'''
+import math
+import collections
+from ngsutils.support import memoize
+
+_default_background = {'A': 0.3, 'T': 0.3, 'C': 0.2, 'G': 0.2}
+
+NucleotideLogLikelihood = collections.namedtuple('NucleotideLogLikelihood', 'A C G T pseudo')
+
+@memoize
+def pseudo_count(N, bg):
+    '''
+    >>> pseudo_count(100, _default_background['A'])
+    3.0
+    >>> pseudo_count(100, _default_background['C'])
+    2.0
+    '''
+
+    return bg * math.sqrt(N)
+
+
+def calc_llh(A, C, G, T, bg=_default_background, pseudo='auto'):
+    if pseudo == 'auto':
+        N = A + C + G + T
+        Ap = float(A) + pseudo_count(N, bg['A'])
+        Cp = float(C) + pseudo_count(N, bg['C'])
+        Gp = float(G) + pseudo_count(N, bg['G'])
+        Tp = float(T) + pseudo_count(N, bg['T'])
+    elif pseudo:
+        Ap = float(A) + pseudo
+        Cp = float(C) + pseudo
+        Gp = float(G) + pseudo
+        Tp = float(T) + pseudo
+    else:
+        Ap = float(A)
+        Cp = float(C)
+        Gp = float(G)
+        Tp = float(T)
+
+    Np = Ap + Cp + Gp + Tp
+
+    freqA = Ap / Np
+    freqC = Cp / Np
+    freqG = Gp / Np
+    freqT = Tp / Np
+
+    return NucleotideLogLikelihood(math.log(freqA / bg['A']), math.log(freqC / bg['C']), math.log(freqG / bg['G']), math.log(freqT / bg['T']), pseudo)
+
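+# Editor's sketch (not part of the original module): an enriched base gets a
+# positive log-likelihood relative to the background, a depleted one negative
+# (the counts below are illustrative).
+def _example_calc_llh():
+    '''
+    >>> llh = calc_llh(90, 2, 3, 5, pseudo=1)
+    >>> llh.A > 0 > llh.C
+    True
+    '''
+    pass
+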
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/ngs_utils.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+"""
+
+Common util classes / functions for the NGS project
+
+"""
+import sys
+import os
+import gzip
+import re
+import collections
+
+
+def format_number(n):
+    '''
+    >>> format_number(1000)
+    '1,000'
+    >>> format_number(1234567)
+    '1,234,567'
+    '''
+    ar = list(str(n))
+    for i in range(len(ar))[::-3][1:]:
+        ar.insert(i + 1, ',')
+    return ''.join(ar)
+
+
+def natural_sort(ar):
+    '''
+    >>> natural_sort('1 3 4 2 5'.split())
+    ['1', '2', '3', '4', '5']
+    >>> natural_sort('1 10 20 2 3 4'.split())
+    ['1', '2', '3', '4', '10', '20']
+    '''
+    to_sort = []
+    for item in ar:
+        spl = re.split(r'(\d+)', item)
+        l2 = []
+        for el in spl:
+            try:
+                n = int(el)
+            except ValueError:
+                n = el
+            l2.append(n)
+        to_sort.append((l2, item))
+
+    to_sort.sort()
+    return [x[1] for x in to_sort]
+
+
+def dictify(values, colnames):
+    """
+    Convert a list of values into a dictionary based upon given column names.
+
+    If the column name starts with an '@', the value is assumed to be a comma
+    separated list.
+
+    If the name starts with a '#', the value is assumed to be an int.
+
+    If the name starts with '@#', the value is assumed to be a comma
+    separated list of ints.
+
+    """
+    d = {}
+    for i in xrange(len(colnames)):
+        key = colnames[i]
+        split = False
+        num = False
+
+        if key[0] == '@':
+            key = key[1:]
+            split = True
+        if key[0] == '#':
+            key = key[1:]
+            num = True
+
+        if i < len(values):
+            if num and split:
+                val = [int(x) for x in values[i].rstrip(',').split(',')]
+            elif num:
+                val = int(values[i])
+            elif split:
+                val = values[i].rstrip(',').split(',')
+            else:
+                val = values[i]
+
+            d[key] = val
+
+        else:
+            d[key] = None
+
+    return d
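+
+
+# Editor's sketch (not part of the original module): dictify applies the
+# '@'/'#' column-name conventions described above.
+def _example_dictify():
+    '''
+    >>> d = dictify(['chr1', '100', '1,2,3'], ['chrom', '#start', '@#scores'])
+    >>> d['chrom'], d['start'], d['scores']
+    ('chr1', 100, [1, 2, 3])
+    '''
+    pass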
+
+
+def gzip_aware_open(fname):
+    if fname == '-':
+        f = sys.stdin
+    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+        f = gzip.open(os.path.expanduser(fname))
+    else:
+        f = open(os.path.expanduser(fname))
+    return f
+
+
+class gzip_opener:
+    '''
+    A Python 2.6 class to handle 'with' opening of text files that may
+    or may not be gzip compressed.
+    '''
+    def __init__(self, fname):
+        self.fname = fname
+
+    def __enter__(self):
+        self.f = gzip_aware_open(self.fname)
+        return self.f
+
+    def __exit__(self, type, value, traceback):
+        if self.f != sys.stdin:
+            self.f.close()
+        return False
+
+
+def filenames_to_uniq(names, new_delim='.'):
+    '''
+    Given a set of file names, produce a list of names consisting of the
+    unique parts of the names. This works from the end of the name.  Chunks of
+    the name are split on '.' and '-'.
+
+    For example:
+        A.foo.bar.txt
+        B.foo.bar.txt
+        returns: ['A','B']
+
+        AA.BB.foo.txt
+        CC.foo.txt
+        returns: ['AA.BB','CC']
+
+    >>> filenames_to_uniq('a.foo.bar.txt b.foo.bar.txt'.split())
+    ['a', 'b']
+    >>> filenames_to_uniq('a.b.foo.txt c.foo.txt'.split())
+    ['a.b', 'c']
+
+    '''
+    name_words = []
+    maxlen = 0
+    for name in names:
+        name_words.append(name.replace('.', ' ').replace('-', ' ').strip().split())
+        name_words[-1].reverse()
+        if len(name_words[-1]) > maxlen:
+            maxlen = len(name_words[-1])
+
+    common = [False, ] * maxlen
+    for i in xrange(maxlen):
+        last = None
+        same = True
+        for nameword in name_words:
+            if i >= len(nameword):
+                same = False
+                break
+            if not last:
+                last = nameword[i]
+            elif nameword[i] != last:
+                same = False
+                break
+        common[i] = same
+
+    newnames = []
+    for nameword in name_words:
+        nn = []
+        for (i, val) in enumerate(common):
+            if not val and i < len(nameword):
+                nn.append(nameword[i])
+        nn.reverse()
+        newnames.append(new_delim.join(nn))
+    return newnames
+
+
+def parse_args(argv, defaults=None, expected_argc=0):
+    opts = {}
+    if defaults:
+        opts.update(defaults)
+
+    args = []
+
+    i = 0
+    while i < len(argv):
+        if argv[i][0] == '-':
+            arg = argv[i].lstrip('-')
+            if '=' in arg:
+                k, v = arg.split('=', 1)
+                if defaults and k in defaults:
+                    if type(defaults[k]) == float:
+                        opts[k] = float(v)
+                    elif type(defaults[k]) == int:
+                        opts[k] = int(v)
+                    else:
+                        opts[k] = v
+            else:
+                opts[arg] = True
+        else:
+            args.append(argv[i])
+        i += 1
+
+    while len(args) < expected_argc:
+        args.append(None)
+    return opts, args
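+
+
+# Editor's sketch (not part of the original module): flags become True,
+# 'key=value' options are coerced to the type of their default, and the
+# positional list is padded to expected_argc (the option names below are
+# hypothetical).
+def _example_parse_args():
+    '''
+    >>> opts, args = parse_args(['-v', '-cutoff=2.5', 'input.bam'],
+    ...                         {'v': False, 'cutoff': 1.0}, expected_argc=2)
+    >>> opts['v'], opts['cutoff']
+    (True, 2.5)
+    >>> args
+    ['input.bam', None]
+    '''
+    pass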
+
+
+class memoize(object):
+    'Simple memoizing decorator to cache results'
+    def __init__(self, func):
+        self.func = func
+        self.cache = {}
+
+    def __call__(self, *args):
+        try:
+            # a tuple is always an instance of collections.Hashable even when
+            # its elements are not, so actually try hashing the arguments
+            hash(args)
+        except TypeError:
+            # uncacheable. a list, for instance.
+            # better to not cache than blow up.
+            return self.func(*args)
+
+        if args in self.cache:
+            return self.cache[args]
+        else:
+            value = self.func(*args)
+            self.cache[args] = value
+            return value
Binary file ngsutils/support/ngs_utils.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/regions.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,166 @@
+class RangeMatch(object):
+    '''
+    Simple genomic ranges.  You can define chrom:start-end ranges, then ask if a
+    particular genomic coordinate maps to any of those ranges.  This is less
+    efficient than an R-Tree, but easier to code.
+    '''
+    def __init__(self, name):
+        self.ranges = {}
+        self.name = name
+
+    def add_range(self, chrom, strand, start, end):
+        if not chrom in self.ranges:
+            self.ranges[chrom] = {}
+
+        bin = start / 100000
+        if not bin in self.ranges[chrom]:
+            self.ranges[chrom][bin] = []
+        self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+        if (end / 100000) != bin:
+            for bin in xrange(bin + 1, (end / 100000) + 1):
+                if not bin in self.ranges[chrom]:
+                    self.ranges[chrom][bin] = []
+                self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+    def get_tag(self, chrom, strand, pos, ignore_strand=False):
+        '''
+        returns (region, is_reverse_orientation)
+        '''
+        if not chrom in self.ranges:
+            return None, False
+        bin = pos / 100000
+        if not bin in self.ranges[chrom]:
+            return None, False
+        for start, end, r_strand in self.ranges[chrom][bin]:
+            if pos >= start and pos <= end:
+                if ignore_strand or strand == r_strand:
+                    return self.name, False
+                return self.name, True
+        return None, False
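+
+
+# Editor's sketch (not part of the original module): RangeMatch reports the
+# region name and whether the hit was on the opposite strand.
+def _example_rangematch():
+    '''
+    >>> rm = RangeMatch('promoter')
+    >>> rm.add_range('chr1', '+', 1000, 2000)
+    >>> rm.get_tag('chr1', '+', 1500)
+    ('promoter', False)
+    >>> rm.get_tag('chr1', '-', 1500)
+    ('promoter', True)
+    >>> rm.get_tag('chr1', '+', 5000)
+    (None, False)
+    '''
+    pass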
+
+
+class RegionTagger(object):
+    def __init__(self, gtf, valid_chroms=None, only_first_fragment=True):
+        self.regions = []
+        self.counts = {}
+        self.only_first_fragment = only_first_fragment
+
+        coding = RangeMatch('coding')
+        exons = RangeMatch('other-exon')
+        utr_5 = RangeMatch('utr-5')
+        utr_3 = RangeMatch('utr-3')
+        introns = RangeMatch('intron')
+        promoters = RangeMatch('promoter')
+
+        for gene in gtf.genes:
+            if valid_chroms and not gene.chrom in valid_chroms:
+                continue
+            if gene.strand == '+':
+                promoters.add_range(gene.chrom, gene.strand, gene.start - 2000, gene.start)
+            else:
+                promoters.add_range(gene.chrom, gene.strand, gene.end, gene.end + 2000)
+
+            for transcript in gene.transcripts:
+                if transcript.has_cds:
+                    for start, end in transcript.cds:
+                        coding.add_range(gene.chrom, gene.strand, start, end)
+
+                    # TODO: Fix this so that it iterates over exons in the 5'/3' UTRS
+                    for s, e in transcript.utr_5:
+                        utr_5.add_range(gene.chrom, gene.strand, s, e)
+                    for s, e in transcript.utr_3:
+                        utr_3.add_range(gene.chrom, gene.strand, s, e)
+
+                last_end = None
+                for start, end in transcript.exons:
+                    if last_end:
+                        introns.add_range(gene.chrom, gene.strand, last_end, start)
+                    exons.add_range(gene.chrom, gene.strand, start, end)
+                    last_end = end
+
+
+        self.regions.append(coding)
+        self.regions.append(utr_5)
+        self.regions.append(utr_3)
+        self.regions.append(exons)
+        self.regions.append(introns)
+        self.regions.append(promoters)
+
+        self.counts['coding'] = 0
+        self.counts['coding-rev'] = 0
+        self.counts['other-exon'] = 0
+        self.counts['utr-5'] = 0
+        self.counts['utr-3'] = 0
+        self.counts['utr-5-rev'] = 0
+        self.counts['utr-3-rev'] = 0
+        self.counts['intron'] = 0
+        self.counts['promoter'] = 0
+        self.counts['other-exon-rev'] = 0
+        self.counts['intron-rev'] = 0
+        self.counts['promoter-rev'] = 0
+        self.counts['junction'] = 0
+        self.counts['intergenic'] = 0
+        self.counts['mitochondrial'] = 0
+
+    def add_read(self, read, chrom):
+        if read.is_unmapped:
+            return
+
+        if self.only_first_fragment and read.is_paired and not read.is_read1:
+            return
+
+        tag = None
+        is_rev = False
+
+        strand = '-' if read.is_reverse else '+'
+
+        if chrom == 'chrM':
+            tag = 'mitochondrial'
+
+        if not tag:
+            for op, length in read.cigar:
+                if op == 3:
+                    tag = 'junction'
+                    break
+
+        if not tag:
+            for region in self.regions:
+                tag, is_rev = region.get_tag(chrom, strand, read.pos)
+                if tag:
+                    break
+
+        if not tag:
+            tag = 'intergenic'
+
+        if tag:
+            if is_rev:
+                self.counts['%s-rev' % tag] += 1
+            else:
+                self.counts[tag] += 1
+
+        return tag
+
+    def tag_region(self, chrom, start, end, strand):
+        tag = None
+
+        if chrom == 'chrM' or chrom == 'M':
+            tag = 'mitochondrial'
+
+        if not tag:
+            for region in self.regions:
+                tag, is_rev = region.get_tag(chrom, strand, start)
+                if tag and is_rev:
+                    tag = '%s-rev' % tag
+
+                endtag = None
+                if start != end:
+                    endtag, is_rev = region.get_tag(chrom, strand, end)
+                    if endtag and is_rev:
+                        endtag = '%s-rev' % endtag
+
+                if tag and endtag and endtag != tag:
+                    tag = '%s/%s' % (tag, endtag)
+
+                # stop at the first (highest-priority) matching region so a
+                # later non-match doesn't reset the tag
+                if tag:
+                    break
+
+        if not tag:
+            tag = 'intergenic'
+
+        return tag
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/stats.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,161 @@
+'''
+various statistical tests and methods...
+'''
+import math
+from ngsutils.support import memoize
+
+def median(vals):
+    '''
+    >>> median([1,2,3])
+    2
+    >>> median([1,2,3,4])
+    2.5
+    '''
+    vals = sorted(vals)  # work on a copy; don't mutate the caller's list
+
+    if len(vals) % 2 == 1:
+        return vals[len(vals) / 2]
+    else:
+        a = vals[(len(vals) / 2) - 1]
+        b = vals[(len(vals) / 2)]
+        return float(a + b) / 2
+
+
+def mean_stdev(l):
+    '''
+    >>> mean_stdev([1,2,2,2])
+    (1.75, 0.5)
+    >>> mean_stdev([2,2,2,2])
+    (2.0, 0.0)
+    '''
+
+    acc = 0
+    for el in l:
+        acc += el
+
+    mean = float(acc) / len(l)
+    acc = 0
+    for el in l:
+        acc += (el - mean) ** 2
+
+    if len(l) > 1:
+        # sample standard deviation (n - 1 denominator)
+        stdev = math.sqrt(float(acc) / (len(l) - 1))
+    else:
+        stdev = 0.0
+
+    return (mean, stdev)
+
+
+def counts_median(d):
+    '''
+    Calculate the median from counts stored in a dictionary
+    >>> counts_median({ 1: 4, 2: 1, 3: 4 })
+    2
+    >>> counts_median({ 1: 4, 3: 4 })
+    2
+
+    '''
+    count = 0
+    for k in d:
+        count += d[k]
+
+    if count == 0:
+        return 0
+
+    acc = 0.0
+    last = 0
+    for k in sorted(d):
+        if last:
+            return (last + k) / 2
+        acc += d[k]
+        if acc / count == 0.5:
+            last = k
+        elif acc / count > 0.5:
+            return k
+
+
+def counts_mean_stdev(d):
+    '''
+
+    calc mean / stdev when data is stored as counts in a dictionary
+
+    Ex:
+        { 1: 4, 2: 1, 3: 4 } = [1, 1, 1, 1, 2, 3, 3, 3, 3]
+
+    >>> counts_mean_stdev({ 1: 4, 2: 1, 3: 4 })
+    (2.0, 1.0)
+
+    '''
+
+    acc = 0
+    count = 0
+    for k in d:
+        acc += k * d[k]
+        count += d[k]
+
+    mean = float(acc) / count
+
+    acc = 0
+    for k in d:
+        acc += (((k - mean) ** 2) * d[k])
+
+    if count > 1:
+        stdev = math.sqrt(float(acc) / (count - 1))
+    else:
+        stdev = 0.0
+
+    return (mean, stdev)
+
+@memoize
+def poisson_prob(x, mean):
+    '''
+        Return the probability of observing between 1 and x counts from
+        a Poisson distribution with the given mean.
+
+        prob(x) = sum(i=1..x){poisson(i)}
+
+        >>> poisson_prob(6,10)
+        0.1300960209527205
+        >>> poisson_prob(8,10)
+        0.33277427882095645
+    '''
+    acc = 0.0
+    for i in xrange(1, x+1):
+        acc += poisson_func(i, mean)
+    return acc
+
+@memoize
+def poisson_func(mu, lambd):
+    '''
+        The Poisson probability mass function
+
+        p(mu) = (lambda^mu * e^(-lambda)) / (mu!)
+
+        mu is a count
+        lambd is the mean
+
+        >>> poisson_func(1,10)
+        0.00045399929762484856
+        >>> poisson_func(2,10)
+        0.0022699964881242427
+        >>> poisson_func(3,10)
+        0.007566654960414142
+    '''
+    return (lambd ** mu) * (math.exp(-1 * lambd)) / _factorial(mu)
+
+
+@memoize
+def _factorial(x):
+    '''
+    >>> _factorial(1)
+    1
+    >>> _factorial(2)
+    2
+    >>> _factorial(3)
+    6
+    '''
+    return math.factorial(x)
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
Binary file test-data/ngsutils_bam_filter_input1.bam has changed
Binary file test-data/ngsutils_bam_filter_result1.bam has changed
Binary file test-data/ngsutils_bam_filter_result2.bam has changed
Binary file test-data/ngsutils_bam_filter_result3.bam has changed
Binary file test-data/ngsutils_bam_filter_result4.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="pysam" version="0.7.7">
+        <repository changeset_revision="0a5141bdf9d0" name="package_pysam_0_7_7" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>