changeset 3:9b9ae5963d3c draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 4a3c9f195ba5d899b1a1ce5e80281cdf230f456a
author iuc
date Mon, 23 Oct 2017 13:27:02 -0400
parents 7a68005de299
children aecfe10118ed
files bam_filter.xml filter.py macros.xml ngsutils/__init__.py ngsutils/__init__.pyc ngsutils/bam/__init__.py ngsutils/bam/__init__.pyc ngsutils/bed/__init__.py ngsutils/bed/__init__.pyc ngsutils/support/__init__.py ngsutils/support/__init__.pyc ngsutils/support/bgzip.py ngsutils/support/dbsnp.py ngsutils/support/dbsnp.pyc ngsutils/support/llh.py ngsutils/support/ngs_utils.py ngsutils/support/ngs_utils.pyc ngsutils/support/regions.py ngsutils/support/stats.py
diffstat 18 files changed, 69 insertions(+), 2385 deletions(-) [+]
line wrap: on
line diff
--- a/bam_filter.xml	Sun Nov 27 15:01:21 2016 -0500
+++ b/bam_filter.xml	Mon Oct 23 13:27:02 2017 -0400
@@ -1,4 +1,4 @@
-<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@.1">
+<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@">
     <description>Removes reads from a BAM file based on criteria</description>
     <macros>
         <import>macros.xml</import>
@@ -7,67 +7,61 @@
     <expand macro="stdio" />
     <expand macro="version" />
     <command><![CDATA[
-
-    ## If the tool is executed with no filtering option,
-    ## the default parameters simply copy over the input file
-    if grep -q "\w" ${parameters};
-    then
-        $__tool_directory__/filter.py
-        $infile
-        $outfile
-        `cat ${parameters}`;
-    else
-        cp $infile $outfile;
-    fi
-
-]]>
-    </command>
+## If the tool is executed with no filtering option,
+## the default parameters simply copy over the input file
+if grep -q "\w" '${parameters}'; then
+    '$__tool_directory__/filter.py'
+    '$infile'
+    '$outfile'
+    `cat ${parameters}`;
+else
+    cp '$infile' '$outfile';
+fi
+    ]]></command>
     <configfiles>
-        <configfile name="parameters">
-<![CDATA[
-        #if $minlen:
-            -minlen $minlen
-        #end if
-        #if $maxlen
-            -maxlen $maxlen
-        #end if
-        $mapped
-        $unmapped
-        $properpair
-        $noproperpair
-        #if $mask:
-            -mask ${mask}
-        #end if
-        #if int($uniq) > -1:
-            -uniq
-            #if int($uniq) > 0:
-                $uniq
-            #end if
-        #end if
-        $uniq_start
-        #if $mismatch:
-            -mismatch $mismatch
-        #end if
-        $nosecondary
-        $noqcfail
-        $nopcrdup
-        #if $excludebed:
-            -excludebed ${excludebed} $ignore_strand
-        #end if
-        #if $includebed:
-            -includebed ${includebed} $ignore_strand
-        #end if
-        #if $includeref:
-            -includeref ${includeref}
-        #end if
-        #if $excluderef:
-            -excluderef ${excluderef}
-        #end if
-        #if $maximum_mismatch_ratio
-            -maximum_mismatch_ratio $maximum_mismatch_ratio
-        #end if
-    ]]>
-        </configfile>
+        <configfile name="parameters"><![CDATA[
+#if $minlen:
+    -minlen $minlen
+#end if
+#if $maxlen
+    -maxlen $maxlen
+#end if
+$mapped
+$unmapped
+$properpair
+$noproperpair
+#if $mask:
+    -mask ${mask}
+#end if
+#if int($uniq) > -1:
+    -uniq
+    #if int($uniq) > 0:
+        $uniq
+    #end if
+#end if
+$uniq_start
+#if $mismatch:
+    -mismatch $mismatch
+#end if
+$nosecondary
+$noqcfail
+$nopcrdup
+#if $excludebed:
+    -excludebed ${excludebed} $ignore_strand
+#end if
+#if $includebed:
+    -includebed ${includebed} $ignore_strand
+#end if
+#if $includeref:
+    -includeref ${includeref}
+#end if
+#if $excluderef:
+    -excluderef ${excluderef}
+#end if
+#if $maximum_mismatch_ratio
+    -maximum_mismatch_ratio $maximum_mismatch_ratio
+#end if
+        ]]></configfile>
     </configfiles>
     <inputs>
         <param name="infile" type="data" format="bam" label="Select BAM dataset" />
@@ -129,7 +123,6 @@
         <param argument="-maximum_mismatch_ratio" type="float" value="" optional="True" min="0.0" max="1.0"
             label="Filter by maximum mismatch ratio"
             help="fraction of length"/>
-
     </inputs>
     <outputs>
         <data format="bam" name="outfile" />
@@ -234,6 +227,5 @@
 | -excluderef refname            | Exclude reads mapped to a particular reference  |
 |                                | (e.g. chrM, or _dup chromosomes)                |
 +--------------------------------+-------------------------------------------------+
-]]>
-    </help>
+    ]]></help>
 </tool>
--- a/filter.py	Sun Nov 27 15:01:21 2016 -0500
+++ b/filter.py	Mon Oct 23 13:27:02 2017 -0400
@@ -107,6 +107,7 @@
     The tag type (:i, :f, :Z) is optional.
 
 """
+from __future__ import print_function
 
 import os
 import sys
@@ -118,8 +119,8 @@
 
 
 def usage():
-    print __doc__
-    print """
+    print(__doc__)
+    print("""
 Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria...
 
 Options:
@@ -131,7 +132,7 @@
 
 This will remove all unmapped reads, as well as any reads that have an AS:i
 value less than 1000.
-"""
+""")
     sys.exit(1)
 
 
@@ -760,10 +761,10 @@
             # guess at type...
             try:
                 self.value = int(value)
-            except:
+            except ValueError:
                 try:
                     self.value = float(value)
-                except:
+                except ValueError:
                     self.value = value
 
     def get_value(self, read):
@@ -960,7 +961,7 @@
             outfile = arg
         elif arg[0] == '-':
             if not arg[1:] in _criteria:
-                print "Unknown criterion: %s" % arg
+                print("Unknown criterion: %s" % arg)
                 fail = True
             if crit_args:
                 criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
@@ -968,7 +969,7 @@
         elif crit_args:
             crit_args.append(arg)
         else:
-            print "Unknown argument: %s" % arg
+            print("Unknown argument: %s" % arg)
             fail = True
 
     if not fail and crit_args:
@@ -979,11 +980,11 @@
             usage()
 
         if not infile:
-            print "Missing: input bamfile"
+            print("Missing: input bamfile")
         if not outfile:
-            print "Missing: output bamfile"
+            print("Missing: output bamfile")
         if not criteria:
-            print "Missing: filtering criteria"
+            print("Missing: filtering criteria")
         usage()
     else:
         bam_filter(infile, outfile, criteria, failed, verbose)
--- a/macros.xml	Sun Nov 27 15:01:21 2016 -0500
+++ b/macros.xml	Mon Oct 23 13:27:02 2017 -0400
@@ -1,8 +1,8 @@
 <macros>
-    <token name="@WRAPPER_VERSION@">0.5.8</token>
+    <token name="@WRAPPER_VERSION@">0.5.9</token>
     <xml name="requirements">
         <requirements>
-            <requirement type="package" version="0.9.1.4">pysam</requirement>
+            <requirement type="package" version="@WRAPPER_VERSION@">ngsutils</requirement>
         </requirements>
     </xml>
     <xml name="version">
Binary file ngsutils/__init__.pyc has changed
--- a/ngsutils/bam/__init__.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,880 +0,0 @@
-import os
-import re
-import sys
-
-import ngsutils.support
-import pysam
-try:
-    from eta import ETA
-except:
-    pass
-
-
-def bam_open(fname, mode='r', *args, **kwargs):
-    if fname.lower()[-4:] == '.bam':
-        return pysam.Samfile(fname, '%sb' % mode, *args, **kwargs)
-    return pysam.Samfile(fname, '%s' % mode, *args, **kwargs)
-
-
-def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None):
-    if not quiet and bam.filename:
-        eta = ETA(os.stat(bam.filename).st_size)
-    else:
-        eta = None
-
-    for pileup in bam.pileup(mask=mask):
-        pos = bam.tell()
-        bgz_offset = pos >> 16
-
-        if not quiet:
-            if callback:
-                eta.print_status(bgz_offset, extra=callback(pileup))
-            else:
-                eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos))
-
-        yield pileup
-
-    if eta:
-        eta.done()
-
-
-def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None):
-    '''
-    >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
-    ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
-    '''
-
-    if os.path.exists('%s.bai' % bam.filename):
-        # This is an indexed file, so it is ref sorted...
-        # Meaning that we should show chrom:pos, instead of read names
-        show_ref_pos = True
-
-    eta = None
-
-    if not ref:
-        if not quiet and bam.filename:
-            eta = ETA(os.stat(bam.filename).st_size)
-
-        for read in bam:
-            pos = bam.tell()
-            bgz_offset = pos >> 16
-
-            if not quiet and eta:
-                if callback:
-                    eta.print_status(bgz_offset, extra=callback(read))
-                elif (show_ref_pos):
-                    if read.tid > -1:
-                        eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
-                    else:
-                        eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname))
-                else:
-                    eta.print_status(bgz_offset, extra='%s' % read.qname)
-
-            yield read
-
-    else:
-        working_chrom = None
-        if ref in bam.references:
-            working_chrom = ref
-        elif ref[0:3] == 'chr':
-            # compensate for Ensembl vs UCSC ref naming
-            if ref[3:] in bam.references:
-                working_chrom = ref[3:]
-
-        if not working_chrom:
-            raise ValueError('Missing reference: %s' % ref)
-
-        tid = bam.gettid(working_chrom)
-
-        if not start:
-            start = 0
-        if not end:
-            end = bam.lengths[tid]
-
-        if not quiet and bam.filename:
-            eta = ETA(end - start)
-
-        for read in bam.fetch(working_chrom, start, end):
-            if not quiet and eta:
-                if callback:
-                    eta.print_status(read.pos - start, extra=callback(read))
-                else:
-                    eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
-
-            yield read
-
-    if eta:
-        eta.done()
-
-
-def bam_batch_reads(bam, quiet=False):
-    '''
-    Batch mapping for the same reads (qname) together, this way
-    they can all be compared/converted together.
-    '''
-    reads = []
-    last = None
-    for read in bam_iter(bam, quiet=quiet):
-        if last and read.qname != last:
-            yield reads
-            reads = []
-        last = read.qname
-        reads.append(read)
-
-    if reads:
-        yield reads
-
-
-bam_cigar = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']
-bam_cigar_op = {
-    'M': 0,
-    'I': 1,
-    'D': 2,
-    'N': 3,
-    'S': 4,
-    'H': 5,
-    'P': 6,
-    '=': 7,
-    'X': 8,
-}
-
-
-def cigar_fromstr(s):
-    '''
-    >>> cigar_fromstr('10M5I20M')
-    [(0, 10), (1, 5), (0, 20)]
-    >>> cigar_fromstr('1M1I1D1N1S1H1P')
-    [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
-    '''
-    ret = []
-    spl = re.split('([0-9]+)', s)[1:]
-    for op, size in zip(spl[1::2], spl[::2]):
-        ret.append((bam_cigar_op[op], int(size)))
-    return ret
-
-
-def cigar_tostr(cigar):
-    '''
-    >>> cigar_tostr(((0, 10), (1, 5), (0, 20)))
-    '10M5I20M'
-    >>> cigar_tostr(((0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)))
-    '1M1I1D1N1S1H1P'
-    '''
-
-    s = ''
-
-    for op, size in cigar:
-        s += '%s%s' % (size, bam_cigar[op])
-
-    return s
-
-
-def cigar_read_len(cigar):
-    '''
-    >>> cigar_read_len(cigar_fromstr('8M'))
-    8
-    >>> cigar_read_len(cigar_fromstr('8M100N8M'))
-    16
-    >>> cigar_read_len(cigar_fromstr('8M10I8M'))
-    26
-    >>> cigar_read_len(cigar_fromstr('8M10D8M'))
-    16
-    '''
-
-    read_pos = 0
-
-    for op, length in cigar:
-        if op == 0:  # M
-            read_pos += length
-        elif op == 1:  # I
-            read_pos += length
-        elif op == 2:  # D
-            pass
-        elif op == 3:  # N
-            pass
-        elif op == 4:  # S
-            read_pos += length
-        elif op == 5:  # H
-            pass
-        elif op == 7:  # =
-            read_pos += length
-        elif op == 8:  # X
-            read_pos += length
-        else:
-            raise ValueError("Unsupported CIGAR operation: %s" % op)
-
-    return read_pos
-
-
-def read_calc_mismatches(read):
-    inserts = 0
-    deletions = 0
-    indels = 0
-    edits = int(read.opt('NM'))
-    #
-    # NM counts the length of indels
-    # We really just care about *if* there is an indel, not the size
-    #
-
-    for op, length in read.cigar:
-        if op == 1:
-            inserts += length
-            indels += 1
-        elif op == 2:
-            deletions += length
-            indels += 1
-
-    return edits - inserts - deletions + indels
-
-
-def _extract_md_matches(md, maxlength):
-    md_pos = 0
-
-    while md and md_pos < maxlength:
-        # preload a zero so that immediate mismatches will be caught
-        # the zero will have no affect otherwise...
-        tmp = '0'
-
-        # look for matches
-        while md and md[0] in '0123456789':
-            tmp += md[0]
-            md = md[1:]
-
-        pos = int(tmp)
-        if pos > maxlength:
-            return (maxlength, '%s%s' % (pos - maxlength, md))
-        return (pos, md)
-
-
-def read_calc_variations(read):
-    'see _read_calc_variations'
-    for tup in _read_calc_variations(read.pos, read.cigar, read.opt('MD'), read.seq):
-        yield tup
-
-
-def _read_calc_variations(start_pos, cigar, md, seq):
-    '''
-    For each variation, outputs a tuple: (op, pos, seq)
-
-    op  - operation (0 = mismatch, 1 = insert, 2 = deletion) (like CIGAR)
-    pos - 0-based position of the variation (relative to reference)
-    seq - the base (or bases) involved in the variation
-          for mismatch or insert, this is the sequence inserted
-          for deletions, this is the reference sequence that was removed
-
-    MD is the mismatch string. Not all aligners include the tag. If your aligner
-    doesn't include this, then you'll need to add it, or use a different function
-    (see: read_calc_mismatches_gen).
-
-    Special care must be used to handle RNAseq reads that cross
-    an exon-exon junction.
-
-    Also: MD is a *really* dumb format that can't be read correctly with
-          a regex. It must be processed in concert with the CIGAR alignment
-          in order to catch all edge cases. Some implementations insert 0's
-          at the end of inserts / deltions / variations to make parsing easier
-          but not everyone follows this. Look at the complex examples: the
-          CIGAR alignment may show an insert, but the MD just shows all matches.
-
-    Examples: See: http://davetang.org/muse/2011/01/28/perl-and-sam/
-              Also from CCBB actual mappings and manual altered (shortened,
-              made more complex)
-              (doctests included)
-
-    Match/mismatch
-    CIGAR: 36M
-    MD:Z:  1A0C0C0C1T0C0T27
-    MD:Z:  1ACCC1TCT27 (alternative)
-                   1         2
-          123456789012345678901234567890123456
-    ref:  CGATACGGGGACATCCGGCCTGCTCCTTCTCACATG
-           XXXX XXX
-    read: CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG
-          MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
-          -ACCC-TCT---------------------------
-    >>> list(_read_calc_variations(1, [(0,36)], '1A0C0C0C1T0C0T27', 'CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG'))
-    [(0, 2, 'A'), (0, 3, 'C'), (0, 4, 'C'), (0, 5, 'C'), (0, 7, 'T'), (0, 8, 'C'), (0, 9, 'T')]
-
-    Insert
-    CIGAR: 6M1I29M
-    MD:Z: 0C1C0C1C0T0C27
-          C1CC1CTC27 (alt)
-                    1         2
-          123456^789012345678901234567890123456
-    ref:  CACCCC^TCTGACATCCGGCCTGCTCCTTCTCACAT
-          X XX X|XX
-    read: GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT
-          MMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
-          G-GA-GGGG---------------------------
-    >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], '0C1C0C1C0T0C27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT'))
-    [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')]
-    >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], 'C1CC1CTC27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT'))
-    [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')]
-
-
-    Deletion
-    CIGAR: 9M9D27M
-    MD:Z: 2G0A5^ATGATGTCA27
-          2GA5^ATGATGTCA27 (alt)
-    ref:  AGGAATGGGATGATGTCAGGGGTTCCAGGTGGAGACGAGGACTCC
-            XX     ^^^^^^^^^
-    read: AGTGATGGG^^^^^^^^^GGGGTTCCAGGTGGAGACGAGGACTCC
-          MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM
-          --TG-----ATGATGTCA---------------------------
-    >>> list(_read_calc_variations(1, [(0,9), (2,9), (0, 27)], '2G0A5^ATGATGTCA27', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC'))
-    [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA')]
-
-
-    Complex
-    CIGAR: 9M9D11M1I15M
-    MD:Z: 2G0A5^ATGATGTCAA26
-    MD:Z: 2G0A5^ATGATGTCA0G26 (alt)
-                   1         2         3         4
-    pos:  123456789012345678901234567890123456789012345
-    ref:  AGGAATGGGATGATGTCAGGGGTTCCAGG^GGAGACGAGGACTCC
-            XX     ^^^^^^^^^X          |
-    read: AGTGATGGG^^^^^^^^^AGGGTTCCAGGTGGAGACGAGGACTCC
-          MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM
-          --TG-----ATGATGTCAG----------T---------------
-    >>> list(_read_calc_variations(1, [(0,9), (2,9), (0,11), (1,1), (0,15)], '2G0A5^ATGATGTCAA26', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC'))
-    [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA'), (0, 19, 'G'), (1, 30, 'T')]
-
-
-    Complex example - inserts aren't separately handled by MD, only visible in CIGAR
-    CIGAR: 14M2D16M3I42M
-    MD:Z:  14^TC58
-                   1         2         3            4         5         6         7
-    pos:  12345678901234567890123456789012^^^345678901234567890123456789012345678901234567
-    ref:  caagtatcaccatgtcaggcatttttttcatt^^^tttgtagagagagaagacttgctatgttgcccaagctggcct
-                        ^^                |||
-    read: CAAGTATCACCATG^^AGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT
-          MMMMMMMMMMMMMMDDMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
-          --------------tc----------------TGG------------------------------------------
-    >>> list(_read_calc_variations(1, [(0,14), (2,2), (0,16), (1,3), (0,42)], '14^TC58', 'CAAGTATCACCATGAGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT'))
-    [(2, 15, 'TC'), (1, 33, 'TGG')]
-
-
-    Complex example 2:
-    CIGAR: 41M3I10M1I5M1I2M2I10M
-    MD:Z:  44C2C6T6T6
-                   1         2         3         4            5             6
-    pos:  12345678901234567890123456789012345678901^^^2345678901^23456^78^^9012345678
-    ref:  AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
-                                                   |||   X  X   |   X |  ||   X
-    read: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
-          MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
-          -----------------------------------------tta---A--T---c---A----gt---G------
-
-
-    13M 28M 3I 10M 1I 5M 1I 2M 2I 10M
-    >>> list(_read_calc_variations(1, [(0, 41), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
-    [(1, 42, 'TTA'), (0, 45, 'A'), (0, 48, 'T'), (1, 52, 'C'), (0, 55, 'A'), (1, 57, 'G'), (1, 59, 'GT'), (0, 62, 'G')]
-
-
-    Splice junction example:
-    CIGAR: 62M100N13M
-    MD:Z:  2T27C44
-                                                                                 1      1
-                   1         2         3         4         5         6           6      7
-    pos:  12345678901234567890123456789012345678901234567890123456789012| [100] |3456789012345
-    ref:  CCTCATGACCAGCTTGTTGAAGAGATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC
-            X                           X
-    read: CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC
-          --G---------------------------T-----------------------------------------------------
-
-    >>> list(_read_calc_variations(1, [(0,62), (4,100), (0,13)], '2T27C44', 'CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC'))
-    [(0, 3, 'G'), (0, 31, 'T')]
-
-
-    Splice junction example 2:
-    CIGAR: 13M100N28M3I10M1I5M1I2M2I10M
-    MD:Z:  44C2C6T6T6
-                                      1         1         1            1             1
-                   1                  2         3         4            5             6
-    pos:  1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678
-    ref:  AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
-                                                            |||   X  X   |   X |  ||   X
-    read: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
-          MMMMMMMMMMMMM         MMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
-          -------------         ----------------------------tta---A--T---c---A----gt---G------
-
-    13M 100N 28M 3I 10M 1I 5M 1I 2M 2I 10M
-    >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 28), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
-    [(1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')]
-
-
-    Splice junction example 2A:
-    CIGAR: 13M100N7M2D19M3I10M1I5M1I2M2I10M
-    MD:Z:  9A10^GG22C2C6T6T6
-                                      1         1         1            1             1
-                   1                  2         3         4            5             6
-    pos:  1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678
-    ref:  AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
-                                       ^^                   |||   X  X   |   X |  ||   X
-    read: AGGGTGGCGCGAT|-------|CGATGAC^^CATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
-          MMMMMMMMMMMMM         MMMMMMMDDMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
-          ---------C---         ----------------------------tta---A--T---c---A----gt---G------
-          .........A...         .......GG...................   ...C..C... ...T. ..  ...T......
-              9    A        10        ^GG             22          C 2C   6   T     6   T   6
-
-    >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 7), (2, 2), (0, 19), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '9A10^GG22C2C6T6T6', 'AGGGTGGCGCGATCGATGACCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
-    [(0, 10, 'C'), (2, 121, 'GG'), (1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')]
-
-    Real Example
-    242_1071_1799_B1
-    CIGAR: 42M10I3M1D9M1D11M
-    MD:Z:  27G16A0^T6C2^T1C9
-                   1         2         3         4                   5         6         7
-    pos:  123456789012345678901234567890123456789012          345678901234567890123456789012345
-    ref:  ACTGAGAAACCCAACCCTCTGAGACCAGCACACCCCTTTCAA^^^^^^^^^^GCATGTTCCTCCCTCCCCTTCTTTG
-                                     X                          X^      X  ^ X
-    read: ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCC^GTTCCTGCC^CGCCTTCTTTG
-          MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIMMMDMMMMMMMMMDMMMMMMMMMMM
-          ---------------------------A--------------^^^^^^^^^^--CT------G--T-G---------
-
-    >>> list(_read_calc_variations(1, [(0,42), (1,10), (0, 3), (2, 1), (0, 9), (2, 1), (0, 11)], '27G16A0^T6C2^T1C9', 'ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCCGTTCCTGCCCGCCTTCTTTG',  ))
-    [(0, 28, 'A'), (1, 43, 'CACATTTTTG'), (0, 45, 'C'), (2, 46, 'T'), (0, 53, 'G'), (2, 56, 'T'), (0, 58, 'G')]
-
-
-    Real example 2
-    577_1692_891_A1
-    CIGAR: 34M100N39M2I
-    MD:Z:  3T69
-                                                          1         1         1         1
-                   1         2         3                  4         5         6         7
-    pos:  1234567890123456789012345678901234| [100] |567890123456789012345678901234567890123
-    ref:  GGATTCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTC^^
-
-    read: GGAATCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC
-          MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM  NNNNN  MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMII
-          ---A------------------------------         ---------------------------------------TC
-
-    >>> list(_read_calc_variations(1, [(0,34), (3,100), (0, 39), (1, 2)], '3T69', 'GGAATCTTCCCACTGGGTCGATGTTGTTTGTGATCTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC',  ))
-    [(0, 4, 'A'), (1, 174, 'TC')]
-
-    '''
-
-    ref_pos = start_pos
-    read_pos = 0
-
-    for op, length in cigar:
-        if md and md[0] == '0':
-            md = md[1:]
-        # sys.stderr.write('%s, %s, %s\n' %(op, length, md))
-        if op == 0:  # M
-            # how far in the chunk are we? (do *not* update ref_pos until end)
-            md_pos = 0
-            last = None
-            while md and md_pos < length:
-                if last == (op, length, md):
-                    sys.stderr.write('\nInfinite loop in variant finding!\nPos: %s\nCIGAR: (%s, %s)\n' % (ref_pos, op, length))
-                    sys.exit(1)
-                last = (op, length, md)
-                # sys.stderr.write('%s, %s, %s\n' %(op, length, md))
-                chunk_size, md = _extract_md_matches(md, length - md_pos)
-                # sys.stderr.write('   -> %s, %s\n' %(chunk_size, md))
-                md_pos += chunk_size
-
-                # look for mismatches
-                while md_pos < length and md and md[0] not in '0123456789^':
-                    yield (op, ref_pos + md_pos, seq[read_pos + md_pos])
-                    md = md[1:]
-
-                    md_pos += 1
-
-            ref_pos += length
-            read_pos += length
-
-        elif op == 1:  # I
-            # nothing in MD about inserts...
-            yield (op, ref_pos, seq[read_pos:read_pos + length])
-            read_pos += length
-
-        elif op == 2:  # D
-            # prefixed with '^' and includes all of the removed bases
-            if md[0] == '^':
-                md = md[1:]
-            yield (op, ref_pos, md[:length])
-            md = md[length:]
-            ref_pos += length
-
-        elif op == 3:  # N
-            ref_pos += length
-
-
-def read_calc_mismatches_gen(ref, read, chrom):
-    start = read.pos
-    ref_pos = 0
-    read_pos = 0
-
-    for op, length in read.cigar:
-        if op == 1:
-            yield ref_pos, op, None
-            read_pos += length
-        elif op == 2:
-            yield ref_pos, op, None
-            ref_pos += length
-        elif op == 3:
-            ref_pos += length
-        elif op == 0:
-            refseq = ref.fetch(chrom, start + ref_pos, start + ref_pos + length)
-            if not refseq:
-                raise ValueError("Reference '%s' not found in FASTA file: %s" % (chrom, ref.filename))
-            cur_pos = start + ref_pos
-            for refbase, readbase in zip(refseq.upper(), read.seq[read_pos:read_pos + length].upper()):
-                if refbase != readbase:
-                    yield op, cur_pos, readbase
-                cur_pos += 1
-            ref_pos += length
-            read_pos += length
-        else:
-            raise ValueError("Unsupported CIGAR operation: %s" % op)
-
-
-def read_calc_mismatches_ref(ref, read, chrom):
-    edits = 0
-
-    for op, pos, base in read_calc_mismatches_gen(ref, read, chrom):
-        edits += 1
-
-    return edits
-
-
-__region_cache = {}
-
-
-def region_pos_to_genomic_pos(name, start, cigar):
-    '''
-        converts a junction position to a genomic location given a junction
-        ref name, the junction position, and the cigar alignment.
-
-        returns: (genomic ref, genomic pos, genomic cigar)
-
-    >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050,3000-4000', 25, [(0, 100)])
-    ('chr1', 1025, [(0, 25), (3, 950), (0, 50), (3, 950), (0, 25)])
-
-    >>> region_pos_to_genomic_pos('chr1:1000-1050,1050-1200', 25, [(0, 100)])
-    ('chr1', 1025, [(0, 25), (3, 0), (0, 75)])
-
-    >>> region_pos_to_genomic_pos('chr3R:17630851-17630897,17634338-17634384', 17, [(0, 39)])
-    ('chr3R', 17630868, [(0, 29), (3, 3441), (0, 10)])
-
-    >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(4, 25), (0, 50), (4, 25)])
-    ('chr1', 1025, [(4, 25), (0, 25), (3, 950), (0, 25), (4, 25)])
-
-    >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)])
-    ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)])
-
-    >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)])
-    ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)])
-
-    >>> region_pos_to_genomic_pos('chr7:16829153-16829246,16829246-16829339', 62, cigar_fromstr('83M18S'))
-    ('chr7', 16829215, [(0, 31), (3, 0), (0, 52), (4, 18)])
-
-
-    '''
-
-    if name in __region_cache:
-        chrom, fragments = __region_cache[name]
-    else:
-        c1 = name.split(':')
-        chrom = c1[0]
-
-        fragments = []
-        for fragment in c1[1].split(','):
-            s, e = fragment.split('-')
-            fragments.append((int(s), int(e)))
-
-        __region_cache[name] = (chrom, fragments)
-
-    chr_cigar = []
-    chr_start = fragments[0][0]
-
-    read_start = int(start)
-
-    frag_idx = 0
-    frag_start = 0
-    frag_end = 0
-
-    for i, (s, e) in enumerate(fragments):
-        if chr_start + read_start < e:
-            chr_start += read_start
-            frag_idx = i
-            frag_start = s
-            frag_end = e
-            break
-
-        else:
-            chr_start += (e - s)
-            read_start -= (e - s)
-
-    cur_pos = chr_start
-
-    for op, length in cigar:
-        if op in [1, 4, 5]:
-            chr_cigar.append((op, length))
-
-        elif op in [0, 2]:
-            if cur_pos + length <= frag_end:
-                cur_pos += length
-                chr_cigar.append((op, length))
-
-            else:
-                while cur_pos + length > frag_end:
-                    if frag_end - cur_pos > 0:
-                        chr_cigar.append((op, frag_end - cur_pos))
-                        length -= (frag_end - cur_pos)
-                    cur_pos = frag_end
-                    frag_idx += 1
-                    if len(fragments) <= frag_idx:
-                        print 'ERROR converting: ', name, fragments
-                        return (chrom, 0, chr_cigar)
-                    frag_start, frag_end = fragments[frag_idx]
-                    chr_cigar.append((3, frag_start - cur_pos))
-                    cur_pos = frag_start
-
-                cur_pos = cur_pos + length
-                chr_cigar.append((op, length))
-        else:
-            print "Unsupported CIGAR operation (%s)" % bam_cigar[op]
-            sys.exit(1)
-
-    return (chrom, chr_start, chr_cigar)
-
-
-def is_junction_valid(cigar, min_overlap=4):
-    '''
-        Does the genomic cigar alignment represent a 'good' alignment.
-        Used for checking junction->genome alignments
-
-        1) the alignment must not start at a splice junction
-        2) the alignment must not start or end with an overhang
-        3) the alignment must overhang the splice junction by min_overlap (4)
-
-        |     Exon1       |     Intron     |      Exon2       |
-        |-----------------|oooooooooooooooo|------------------|
-                                            XXXXXXXXXXXXXXXXXXXXXXXX (bad 1)
-      XXXXXXXXXXXXX (bad 2)                           XXXXXXXXXXXXXXXX (bad 2)
-                        XX-----------------XXXXXXXXXXXXXXXXX (bad 3)
-
-    >>> is_junction_valid(cigar_fromstr('1000N40M'))
-    (False, 'Starts at gap (1000N40M)')
-
-    >>> is_junction_valid(cigar_fromstr('100M'))
-    (False, "Doesn't cover junction")
-
-    >>> is_junction_valid(cigar_fromstr('100M1000N3M'), 4)
-    (False, "Too short overlap at 3' (100M1000N3M)")
-
-    >>> is_junction_valid(cigar_fromstr('2M1000N100M'), 4)
-    (False, "Too short overlap at 5' (2M1000N100M)")
-
-    >>> is_junction_valid(cigar_fromstr('4M1000N100M'), 4)
-    (True, '')
-
-    >>> is_junction_valid(cigar_fromstr('100M1000N4M'), 4)
-    (True, '')
-
-    >>> is_junction_valid(cigar_fromstr('4M0N100M'), 4)
-    (True, '')
-
-    '''
-    first = True
-    pre_gap = True
-
-    pre_gap_count = 0
-    post_gap_count = 0
-
-    has_gap = False
-
-    for op, length in cigar:
-        # mapping can't start at a gap
-        if first and op == 3:
-            return (False, 'Starts at gap (%s)' % cigar_tostr(cigar))
-        first = False
-
-        if op == 3:
-            pre_gap = False
-            post_gap_count = 0
-            has_gap = True
-
-        elif pre_gap:
-            pre_gap_count += length
-        else:
-            post_gap_count += length
-
-        # mapping must start with more than min_overlap base match
-
-    if not has_gap:
-        return (False, "Doesn't cover junction")
-    elif pre_gap_count < min_overlap:
-        return (False, "Too short overlap at 5' (%s)" % cigar_tostr(cigar))
-    elif post_gap_count < min_overlap:
-        return (False, "Too short overlap at 3' (%s)" % cigar_tostr(cigar))
-
-    return True, ''
-
-
-def read_alignment_fragments_gen(read):
-    '''
-    Takes a read and returns the start and end positions for each match/mismatch region.
-    This will let us know where each read alignment "touches" the genome.
-    '''
-
-    for start, end in _read_alignment_fragments_gen(read.pos, read.cigar):
-        yield (start, end)
-
-
-def _read_alignment_fragments_gen(pos, cigar):
-    ''' Test-able version of read_alignment_fragments_gen
-    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('50M')))
-    [(1, 51)]
-
-    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('25M100N25M')))
-    [(1, 26), (126, 151)]
-
-    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('20M1D4M100N10M5I10M')))
-    [(1, 21), (22, 26), (126, 136), (136, 146)]
-    '''
-    ref_pos = pos
-
-    for op, length in cigar:
-        if op == 0:
-            yield (ref_pos, ref_pos + length)
-            ref_pos += length
-        elif op == 1:
-            pass
-        elif op == 2:
-            ref_pos += length
-        elif op == 3:
-            ref_pos += length
-        else:
-            raise ValueError("Unsupported CIGAR operation: %s" % op)
-
-
-def read_cigar_at_pos(cigar, qpos, is_del):
-    '''
-    Returns the CIGAR operation for a given read position
-
-    qpos is the 0-based index of the base in the read
-    '''
-    pos = 0
-    returnnext = False
-    for op, length in cigar:
-        if returnnext:
-            return op
-        if op == 0:
-            pos += length
-        elif op == 1:
-            pos += length
-        elif op == 2:
-            pass
-        elif op == 3:
-            pass
-        elif op == 4:
-            pos += length
-        elif op == 5:
-            pass
-        else:
-            raise ValueError("Unsupported CIGAR operation: %s" % op)
-
-        if pos > qpos:
-            return op
-        if is_del and pos == qpos:
-            returnnext = True
-
-    return None
-
-
-def cleancigar(cigar):
-    '''
-    Cleans a CIGAR alignment to remove zero length operations
-
-    >>> cigar_tostr(cleancigar(cigar_fromstr('31M0N52M18S')))
-    '83M18S'
-
-    '''
-    newcigar = []
-
-    changed = False
-    zero = False
-
-    for op, size in cigar:
-        if size > 0:
-            if zero and newcigar:
-                if newcigar[-1][0] == op:
-                    newsize = newcigar[-1][1] + size
-                    newcigar[-1] = (op, newsize)
-                zero = 0
-            else:
-                newcigar.append((op, size))
-        else:
-            changed = True
-            zero = True
-
-    if changed:
-        return newcigar
-
-    return None
-
-
-def read_cleancigar(read):
-    '''
-    Replaces the CIGAR string for a read to remove any operations that are zero length.
-
-    This happens to the read in-place
-    '''
-    if read.is_unmapped:
-        return False
-
-    newcigar = []
-
-    newcigar = cleancigar(read.cigar)
-
-    if newcigar:
-        read.cigar = newcigar
-        return True
-
-    return False
-
-
-def read_to_unmapped(read, ref=None):
-    '''
-    Converts a read from mapped to unmapped.
-
-    Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is passed)
-    '''
-
-    newread = pysam.AlignedRead()
-
-    if ref:
-        tags = [('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar)))]
-
-    newread.is_unmapped = True
-    newread.mapq = 0
-    newread.tlen = 0
-    newread.pos = -1
-    newread.pnext = -1
-    newread.rnext = -1
-    newread.tid = -1
-
-    newread.qname = read.qname
-
-    if read.is_paired:
-        newread.is_paired = True
-
-    if not read.is_unmapped and read.is_reverse:
-        newread.seq = ngsutils.support.revcomp(read.seq)
-        newread.qual = read.qual[::-1]
-    else:
-        newread.seq = read.seq
-        newread.qual = read.qual
-
-    newread.tags = tags
-
-    return newread
-
-
-if __name__ == '__main__':
-    import doctest
-    doctest.testmod()
Binary file ngsutils/bam/__init__.pyc has changed
--- a/ngsutils/bed/__init__.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,282 +0,0 @@
-import os
-
-import ngsutils.support.ngs_utils
-import pysam
-
-
-class BedStreamer(object):
-    '''
-    Streams BedRegions from a BED file
-
-    Note - this can only be used once! There is no mechanism to seek the stream.
-    '''
-
-    def __init__(self, fname=None, fileobj=None, quiet=False):
-        if not fname and not fileobj:
-            raise ValueError("You must specify either fname or fileobj!")
-
-        self.reader = ngsutils.support.gzip_reader(fname=fname, quiet=quiet, fileobj=fileobj)
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        try:
-            while True:
-                line = self.reader.next().strip()
-                if line and line[0] != '#':
-                    cols = line.split('\t')
-                    while len(cols) < 6:
-                        cols.append('')
-
-                    return BedRegion(*cols)
-        except:
-            raise StopIteration
-
-
-class BedFile(object):
-    '''
-    BED files are read in their entirety memory, in a series of bins. Each bin
-    is ~100kb in size. Each bin can then be iterated over.
-
-    This is less efficient than using a proper index, but in reality, this
-    usually isn't an issue. However, if the BED file has been Tabix indexed,
-    that index will be used for random access.
-
-    NOTE: This isn't very efficient, so perhaps this can be remodeled into a BedFile
-    and a BedFileIndex where the file is indexed only if random access is requested.
-    '''
-
-    _bin_const = 100000
-
-    def __init__(self, fname=None, fileobj=None, region=None):
-        self._bins = {}
-        self._bin_list = []
-        self._cur_bin_idx = 0
-        self._cur_bin_pos = 0
-        self._tellpos = 0
-        self._total = 0
-        self._length = 0
-        self.__tabix = None
-
-        self.filename = fname
-
-        if os.path.exists('%s.tbi' % fname):
-            self.__tabix = pysam.Tabixfile(fname)
-
-        if fileobj:
-            self.__readfile(fileobj)
-        elif fname:
-            with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj:
-                self.__readfile(fobj)
-        elif region:
-            chrom, startend = region.split(':')
-            if '-' in startend:
-                start, end = [int(x) for x in startend.split('-')]
-            else:
-                start = int(startend)
-                end = start
-            start -= 1
-
-            self.__add_region(BedRegion(chrom, start, end))
-        else:
-            raise ValueError("Must specify either filename, fileobj, or region")
-
-    def __readfile(self, fobj):
-        for line in fobj:
-            line = line.strip()
-            if line and line[0] != '#':
-                cols = line.split('\t')
-                while len(cols) < 6:
-                    cols.append('')
-
-                region = BedRegion(*cols)
-                self.__add_region(region)
-
-        self._bin_list.sort()
-        for bin in self._bins:
-            self._bins[bin].sort()
-
-    def __add_region(self, region):
-        self._total += region.end - region.start
-        self._length += 1
-
-        startbin = region.start / BedFile._bin_const
-        endbin = region.end / BedFile._bin_const
-
-        for bin in xrange(startbin, endbin + 1):
-            if not (region.chrom, bin) in self._bins:
-                self._bin_list.append((region.chrom, bin))
-                self._bins[(region.chrom, bin)] = []
-            self._bins[(region.chrom, bin)].append(region)
-
-    def fetch(self, chrom, start, end, strand=None):
-        '''
-        For TABIX indexed BED files, find all regions w/in a range
-
-        For non-TABIX index BED files, use the calculated bins, and
-        output matching regions
-        '''
-
-        if self.__tabix:
-            for match in self.__tabix.fetch(chrom, start, end):
-                region = BedRegion(*match.split('\t'))
-                if not strand or (strand and region.strand == strand):
-                    yield region
-        else:
-            startbin = start / BedFile._bin_const
-            endbin = end / BedFile._bin_const
-
-            buf = set()
-
-            for bin in xrange(startbin, endbin + 1):
-                if (chrom, bin) in self._bins:
-                    for region in self._bins[(chrom, bin)]:
-                        if strand and strand != region.strand:
-                            continue
-                        if start <= region.start <= end or start <= region.end <= end:
-                            if region not in buf:
-                                yield region
-                                buf.add(region)
-                        elif region.start < start and region.end > end:
-                            if region not in buf:
-                                yield region
-                                buf.add(region)
-
-    def tell(self):
-        return self._tellpos
-
-    def close(self):
-        pass
-
-    @property
-    def length(self):
-        return self._length
-
-    @property
-    def total(self):
-        return self._total
-
-    def __iter__(self):
-        self._cur_bin_idx = 0
-        self._cur_bin_pos = 0
-        self._tellpos = 0
-        return self
-
-    def next(self):
-        if self._cur_bin_idx >= len(self._bin_list):
-            raise StopIteration
-
-        binvals = self._bins[self._bin_list[self._cur_bin_idx]]
-        while self._cur_bin_pos < len(binvals):
-            val = binvals[self._cur_bin_pos]
-            self._cur_bin_pos += 1
-
-            startbin = (val.chrom, val.start / BedFile._bin_const)
-            if startbin == self._bin_list[self._cur_bin_idx]:
-                self._tellpos += 1
-                return val
-
-        self._cur_bin_idx += 1
-        self._cur_bin_pos = 0
-        return self.next()
-
-
-class BedRegion(object):
-    def __init__(self, chrom, start, end, name='', score='', strand='', thickStart='', thickEnd='', rgb='', *args):
-        self.chrom = chrom
-        self.start = int(start)
-        self.end = int(end)
-        self.name = name
-
-        if score == '':
-            self.score = 0
-        else:
-            self.score = float(score)
-
-        if strand == '':
-            self.strand = None
-        else:
-            self.strand = strand
-
-        if thickStart == '':
-            self.thickStart = None
-        else:
-            self.thickStart = thickStart
-
-        if thickEnd == '':
-            self.thickEnd = None
-        else:
-            self.thickEnd = thickEnd
-
-        if rgb == '':
-            self.rgb = None
-        else:
-            self.rgb = rgb
-
-        self.extras = args
-
-    def clone(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, rgb=None, *args):
-        cols = []
-        cols.append(self.chrom if chrom is None else chrom)
-        cols.append(self.start if start is None else start)
-        cols.append(self.end if end is None else end)
-        cols.append(self.name if name is None else name)
-        cols.append(self.score if score is None else score)
-        cols.append(self.strand if strand is None else strand)
-        cols.append(self.thickStart if thickStart is None else thickStart)
-        cols.append(self.thickEnd if thickEnd is None else thickEnd)
-        cols.append(self.rgb if rgb is None else rgb)
-
-        for i, val in enumerate(self.extras):
-            if len(args) > i:
-                cols.append(args[i])
-            else:
-                cols.append(val)
-
-        return BedRegion(*cols)
-
-    @property
-    def score_int(self):
-        score = str(self.score)
-        if score[-2:] == '.0':
-            score = score[:-2]
-
-        return score
-
-    def __key(self):
-        return (self.chrom, self.start, self.end, self.strand, self.name)
-
-    def __lt__(self, other):
-        return self.__key() < other.__key()
-
-    def __gt__(self, other):
-        return self.__key() > other.__key()
-
-    def __eq__(self, other):
-        return self.__key() == other.__key()
-
-    def write(self, out):
-        out.write('%s\n' % self)
-
-    def __repr__(self):
-        outcols = []
-
-        if self.rgb:
-            outcols.append(self.rgb)
-        if self.thickEnd or outcols:
-            outcols.append(self.thickEnd if self.thickEnd else self.end)
-        if self.thickStart or outcols:
-            outcols.append(self.thickStart if self.thickStart else self.start)
-        if self.strand or outcols:
-            outcols.append(self.strand)
-        if self.score_int != '' or outcols:
-            outcols.append(self.score_int)
-        if self.name or outcols:
-            outcols.append(self.name)
-
-        outcols.append(self.end)
-        outcols.append(self.start)
-        outcols.append(self.chrom)
-
-        return '\t'. join([str(x) for x in outcols[::-1]])
Binary file ngsutils/bed/__init__.pyc has changed
--- a/ngsutils/support/__init__.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,254 +0,0 @@
-import collections
-import gzip
-import os
-import re
-import sys
-try:
-    from eta import ETA
-except:
-    pass
-
-
-class FASTARead(collections.namedtuple('FASTARecord', 'name comment seq')):
-    def __repr__(self):
-        if self.comment:
-            return '>%s %s\n%s\n' % (self.name, self.comment, self.seq)
-        return '>%s\n%s\n' % (self.name, self.seq)
-
-    def subseq(self, start, end, comment=None):
-        if self.comment:
-            comment = '%s %s' % (self.comment, comment)
-
-        return FASTARead(self.name, comment, self.seq[start:end])
-
-    def clone(self, name=None, comment=None, seq=None):
-        n = name if name else self.name
-        c = comment if comment else self.comment
-        s = seq if seq else self.seq
-
-        return FASTARead(n, c, s)
-
-    def write(self, out):
-        out.write(repr(self))
-
-
-class FASTA(object):
-    def __init__(self, fname=None, fileobj=None, qual=False):
-        self.fname = fname
-        self.qual = qual
-        if fileobj:
-            self.fileobj = fileobj
-        else:
-            if self.fname == '-':
-                self.fileobj = sys.stdin
-            elif self.fname[-3:] == '.gz' or self.fname[-4:] == '.bgz':
-                self.fileobj = gzip.open(os.path.expanduser(self.fname))
-            else:
-                self.fileobj = open(os.path.expanduser(self.fname))
-
-        if not self.fileobj:
-            raise ValueError("Missing valid filename or fileobj")
-
-    def close(self):
-        if self.fileobj != sys.stdout:
-            self.fileobj.close()
-
-    def tell(self):
-        # always relative to uncompressed...
-        return self.fileobj.tell()
-
-    def seek(self, pos, whence=0):
-        self.fileobj.seek(pos, whence)
-
-    def fetch(self, quiet=False):
-        name = ''
-        comment = ''
-        seq = ''
-
-        if not quiet and self.fname and self.fname != '-':
-            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
-        else:
-            eta = None
-
-        for line in self.fileobj:
-            line = line.strip()
-            if not line:
-                continue
-            if line[0] == '#':
-                continue
-
-            if line[0] == '>':
-                if name and seq:
-                    if eta:
-                        eta.print_status(extra=name)
-                    yield FASTARead(name, comment, seq)
-
-                spl = re.split(r'[ \t]', line[1:], maxsplit=1)
-                name = spl[0]
-                if len(spl) > 1:
-                    comment = spl[1]
-                else:
-                    comment = ''
-                seq = ''
-
-            else:
-                if self.qual:
-                    seq = seq + ' ' + line
-                else:
-                    seq += line
-
-        if name and seq:
-            if eta:
-                eta.print_status(extra=name)
-            yield FASTARead(name, comment, seq)
-
-        if eta:
-            eta.done()
-
-
-def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None):
-    if fileobj:
-        f = fileobj
-    elif fname == '-':
-        f = sys.stdin
-    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
-        f = gzip.open(os.path.expanduser(fname))
-    else:
-        f = open(os.path.expanduser(fname))
-
-    if quiet or fname == '-':
-        eta = None
-    else:
-        eta = ETA(os.stat(fname).st_size, fileobj=f)
-
-    for line in f:
-        if eta:
-            if callback:
-                extra = callback()
-            else:
-                extra = ''
-
-            eta.print_status(extra=extra)
-        yield line
-
-        if done_callback and done_callback():
-                break
-
-    if f != sys.stdin:
-        f.close()
-
-    if eta:
-        eta.done()
-
-
-class Symbolize(object):
-    'Converts strings to symbols - basically a cache of strings'
-
-    def __init__(self):
-        self.__cache = {}
-
-    def __getitem__(self, k):
-        if k not in self.__cache:
-            self.__cache[k] = k
-
-        return self.__cache[k]
-
-
-symbols = Symbolize()
-
-_compliments = {
-    'a': 't',
-    'A': 'T',
-    'c': 'g',
-    'C': 'G',
-    'g': 'c',
-    'G': 'C',
-    't': 'a',
-    'T': 'A',
-    'n': 'n',
-    'N': 'N'
-}
-
-
-def revcomp(seq):
-    '''
-    >>> revcomp('ATCGatcg')
-    'cgatCGAT'
-    '''
-    ret = []
-
-    for s in seq:
-        ret.append(_compliments[s])
-
-    ret.reverse()
-    return ''.join(ret)
-
-
-class Counts(object):
-    '''
-    Setup simple binning.  Bins are continuous 0->max.  Values are added to
-    bins and then means / distributions can be calculated.
-    '''
-
-    def __init__(self):
-        self.bins = []
-
-    def add(self, val):
-        while len(self.bins) <= val:
-            self.bins.append(0)
-        self.bins[val] += 1
-
-    def mean(self):
-        acc = 0
-        count = 0
-
-        for i, val in enumerate(self.bins):
-            acc += (i * val)
-            count += val
-
-        if count > 0:
-            return float(acc) / count
-
-    def max(self):
-        return len(self.bins) - 1
-
-
-def memoize(func):
-    if 'TESTING' in os.environ or 'DEBUG' in os.environ:
-        return func
-
-    __cache = {}
-
-    def inner(*args, **kwargs):
-        k = (args, tuple(kwargs.iteritems()))
-        if k not in __cache:
-            __cache[k] = func(*args, **kwargs)
-        return __cache[k]
-
-    inner.__doc__ = '(@memoized %s)\n%s' % (func.__name__, func.__doc__)
-    return inner
-
-
-def quoted_split(s, delim, quote_char='"'):
-    tokens = []
-
-    buf = ""
-    inquote = False
-
-    for c in s:
-        if inquote:
-            buf += c
-            if c == quote_char:
-                inquote = False
-        elif c == delim:
-            tokens.append(buf)
-            buf = ""
-        else:
-            buf += c
-            if c == quote_char:
-                inquote = True
-
-    if buf:
-        tokens.append(buf)
-
-    return tokens
Binary file ngsutils/support/__init__.pyc has changed
--- a/ngsutils/support/bgzip.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,138 +0,0 @@
-#!/usr/bin/env python
-'''
-Extract the blocks from a BGZip file.
-
-BAM files are stored as blocks in a bgzip archive. This class
-will load the bgzip archive and output the block information.
-'''
-
-import os
-import struct
-import sys
-
-
-class BGZip(object):
-    def __init__(self, fname):
-        self.fname = fname
-        self.pos = 0
-        self.fileobj = open(self.fname)
-        self.fsize = os.stat(self.fname).st_size
-        self.cur_chunk = 0
-
-        self.cpos = 0
-        self.cdata = 0
-
-    def close(self):
-        self.fileobj.close()
-
-    def seek(self, offset):
-        bgz_offset = offset >> 16
-        chunk_offset = offset & 0xFFFF
-
-        self.fileobj.seek(bgz_offset)
-        self.read_chunk()
-        self.chunk_pos = chunk_offset
-
-    def read(self, amount, whence=0):
-        if whence not in [0, 1]:
-            print "Bad Whence value!: %s" % whence
-            return
-
-        if whence == 0:
-            self.seek(0, 0)
-
-        # read into chunk, if not enough data in chunk, read next chunk
-        ret = ''
-        while amount and self.pos < self.fsize:
-            if len(self.cdata) - self.cpos < amount:
-                ret += self.cdata[self.cpos:self.cpos + amount]
-                self.cpos += amount
-                return ret
-            else:
-                ret += self.cdata[self.cpos:]
-                amount = amount - len(ret)
-                self.read_chunk()
-        return ret
-
-    def read_chunk(self):
-        self.fileobj.seek(10)
-        id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
-        subpos = 0
-        bsize = 0
-
-        while subpos < xlen:
-            si1, si2, slen = self._read_fields('<BBH')
-            if si1 == 66 and si2 == 67:
-                bsize, = self._read_fields('<H')
-            else:
-                self.fileobj.seek(slen, 1)
-                self.pos += slen
-
-            subpos += 6 + slen
-
-        cdata_size = bsize - xlen - 19
-        self.cdata = self.fileobj.read(cdata_size)  # inflate value
-        self.fileobj.seek(8)
-
-        self.cur_chunk += 1
-        self.cpos = 0
-
-    def dump(self):
-        self.fileobj.seek(0)
-        block_num = 0
-
-        while self.pos < self.fsize:
-            print "[BLOCK %s]" % block_num
-            # read header
-            id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
-            print 'id1: %s' % id1
-            print 'id2: %s' % id2
-            print 'cm: %s' % cm
-            print 'flg: %s' % flg
-            print 'mtime: %s' % mtime
-            print 'xfl: %s' % xfl
-            print 'os: %s' % os
-            print 'xlen: %s' % xlen
-
-            # read subfields
-            subpos = 0
-            bsize = 0
-
-            while subpos < xlen:
-                si1, si2, slen = self._read_fields('<BBH')
-                print '    si1: %s' % si1
-                print '    si1: %s' % si2
-                print '    slen: %s' % slen
-                print '    data: [%s]' % slen
-
-                if si1 == 66 and si2 == 67:
-                    bsize, = self._read_fields('<H')
-                else:
-                    self.fileobj.seek(slen, 1)
-                    self.pos += slen
-
-                subpos += 6 + slen
-
-            cdata_size = bsize - xlen - 19
-
-            print 'bsize: %s' % bsize
-            print 'cdata: [%s]' % (cdata_size)
-
-            self.fileobj.seek(cdata_size, 1)
-            self.pos += cdata_size
-            crc, isize = self._read_fields('<II')
-
-            print "crc: %s" % crc
-            print "isize: %s" % isize
-            # print "POS: %s" % self.pos
-
-            block_num += 1
-
-    def _read_fields(self, field_types):
-        size = struct.calcsize(field_types)
-        self.pos += size
-        return struct.unpack(field_types, self.fileobj.read(size))
-
-
-if __name__ == '__main__':
-    print BGZip(sys.argv[1]).dump()
--- a/ngsutils/support/dbsnp.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,142 +0,0 @@
-'''
-Support package for processing a dbSNP tabix dump from UCSC.
-'''
-
-import collections
-import sys
-
-import pysam
-from ngsutils.support import revcomp
-
-
-class SNPRecord(collections.namedtuple('SNPRecord', '''bin
-chrom
-chromStart
-chromEnd
-name
-score
-strand
-refNCBI
-refUCSC
-observed
-molType
-clazz
-valid
-avHet
-avHetSE
-func
-locType
-weight
-exceptions
-submitterCount
-submitters
-alleleFreqCount
-alleles
-alleleNs
-alleleFreqs
-bitfields
-''')):
-    __slots__ = ()
-
-    @property
-    def alleles(self):
-        alts = []
-        for alt in self.observed.split('/'):
-            if alt != '-' and self.strand == '-':
-                alt = revcomp(alt)
-
-            alts.append(alt)
-
-        return alts
-
-    @property
-    def snp_length(self):
-        return self.chromEnd - self.chromStart
-
-
-def autotype(ar, length=26):
-    out = []
-    for el in ar:
-        val = None
-        try:
-            val = int(el)
-        except:
-            try:
-                val = float(el)
-            except:
-                val = el
-
-        out.append(val)
-
-    if len(out) < 12:
-        raise TypeError("Invalid dbSNP file! We need at least 12 columns to work with.")
-
-    while len(out) < length:
-        out.append(None)
-    return out
-
-
-class DBSNP(object):
-    def __init__(self, fname):
-        self.dbsnp = pysam.Tabixfile(fname)
-        self.asTup = pysam.asTuple()
-
-    def fetch(self, chrom, pos):
-        'Note: pos is 0-based'
-
-        # Note: tabix the command uses 1-based positions, but
-        #       pysam.Tabixfile uses 0-based positions
-
-        for tup in self.dbsnp.fetch(chrom, pos, pos + 1, parser=self.asTup):
-            snp = SNPRecord._make(autotype(tup))
-            if snp.chromStart == pos:
-                yield snp
-
-    def close(self):
-        self.dbsnp.close()
-
-    def dump(self, chrom, op, pos, base, snp, exit=True):
-        print
-        print ' ->', op, chrom, pos, base
-        print snp
-        print snp.alleles
-        if exit:
-            sys.exit(1)
-
-    def is_valid_variation(self, chrom, op, pos, seq, verbose=False):
-        for snp in self.fetch(chrom, pos):
-            if '/' not in snp.observed or snp.clazz not in ['single', 'mixed', 'in-del', 'insertion', 'deletion']:
-                # these are odd variations that we can't deal with... (microsatellites, tooLongToDisplay members, etc)
-                continue
-
-            if op == 0:
-                if snp.clazz in ['single', 'mixed'] and seq in snp.alleles:
-                    return True
-                # elif verbose:
-                #     for alt in snp.alleles:
-                #         if len(alt) > 1:
-                #             self.dump(chrom, op, pos, seq, snp, False)
-
-            elif op == 1:
-                if snp.clazz in ['insertion', 'mixed', 'in-del']:
-                    if seq in snp.alleles:
-                        if len(seq) > 1 and verbose:
-                            self.dump(chrom, op, pos, seq, snp, False)
-                        return True
-
-                    if verbose:
-                        if len(seq) > 1:
-                            self.dump(chrom, op, pos, seq, snp, False)
-                        else:
-                            for alt in snp.alleles:
-                                if len(alt) > 1:
-                                    self.dump(chrom, op, pos, seq, snp, False)
-
-            elif op == 2:
-                if snp.clazz in ['deletion', 'mixed', 'in-del']:
-                    if '-' in snp.alleles and seq in snp.alleles:
-                        if len(seq) > 1 and verbose:
-                            self.dump(chrom, op, pos, seq, snp, False)
-                        return True
-
-        return False
Binary file ngsutils/support/dbsnp.pyc has changed
--- a/ngsutils/support/llh.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,56 +0,0 @@
-'''
-Methods for calculating log-likelihoods for nucleotide frequencies
-'''
-import collections
-import math
-
-from ngsutils.support import memoize
-
-_default_background = {'A': 0.3, 'T': 0.3, 'C': 0.2, 'G': 0.2}
-
-NucleotideLogLikelihood = collections.namedtuple('NucleotideLogLikelihood', 'A C G T pseudo')
-
-
-@memoize
-def pseudo_count(N, bg):
-    '''
-    >>> pseudo_count(100, _default_background['A'])
-    3
-    >>> pseudo_count(100, _default_background['C'])
-    2
-    '''
-
-    return bg * math.sqrt(N)
-
-
-def calc_llh(A, C, G, T, bg=_default_background, pseudo='auto'):
-    if pseudo == 'auto':
-        N = A + C + G + T
-        Ap = float(A) + pseudo_count(N, bg['A'])
-        Cp = float(C) + pseudo_count(N, bg['C'])
-        Gp = float(G) + pseudo_count(N, bg['G'])
-        Tp = float(T) + pseudo_count(N, bg['T'])
-    elif pseudo:
-        Ap = float(A) + pseudo
-        Cp = float(C) + pseudo
-        Gp = float(G) + pseudo
-        Tp = float(T) + pseudo
-    else:
-        Ap = float(A)
-        Cp = float(C)
-        Gp = float(G)
-        Tp = float(T)
-
-    Np = Ap + Cp + Gp + Tp
-
-    freqA = Ap / Np
-    freqC = Cp / Np
-    freqG = Gp / Np
-    freqT = Tp / Np
-
-    return NucleotideLogLikelihood(math.log(freqA / bg['A']), math.log(freqC / bg['C']), math.log(freqG / bg['G']), math.log(freqT / bg['T']), pseudo)
-
-
-if __name__ == '__main__':
-    import doctest
-    doctest.testmod()
--- a/ngsutils/support/ngs_utils.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,225 +0,0 @@
-#!/usr/bin/env python
-"""
-Common util classes / functions for the NGS project
-"""
-import collections
-import gzip
-import os
-import re
-import sys
-
-
-def format_number(n):
-    '''
-    >>> format_number(1000)
-    '1,000'
-    >>> format_number(1234567)
-    '1,234,567'
-    '''
-    ar = list(str(n))
-    for i in range(len(ar))[::-3][1:]:
-        ar.insert(i + 1, ',')
-    return ''.join(ar)
-
-
-def natural_sort(ar):
-    '''
-    >>> natural_sort('1 3 4 2 5'.split())
-    ['1', '2', '3', '4', '5']
-    >>> natural_sort('1 10 20 2 3 4'.split())
-    ['1', '2', '3', '4', '10', '20']
-    '''
-    to_sort = []
-    for item in ar:
-        spl = re.split('(\d+)', item)
-        l2 = []
-        for el in spl:
-            try:
-                n = int(el)
-            except:
-                n = el
-            l2.append(n)
-        to_sort.append((l2, item))
-
-    to_sort.sort()
-    return [x[1] for x in to_sort]
-
-
-def dictify(values, colnames):
-    """
-    Convert a list of values into a dictionary based upon given column names.
-
-    If the column name starts with an '@', the value is assumed to be a comma
-    separated list.
-
-    If the name starts with a '#', the value is assumed to be an int.
-
-    If the name starts with '@#', the value is assumed to  a comma separated
-    list of ints.
-
-    """
-    d = {}
-    for i in xrange(len(colnames)):
-        key = colnames[i]
-        split = False
-        num = False
-
-        if key[0] == '@':
-            key = key[1:]
-            split = True
-        if key[0] == '#':
-            key = key[1:]
-            num = True
-
-        if i < len(values):
-            if num and split:
-                val = [int(x) for x in values[i].rstrip(',').split(',')]
-            elif num:
-                val = int(values[i])
-            elif split:
-                val = values[i].rstrip(',').split(',')
-            else:
-                val = values[i]
-
-            d[key] = val
-
-        else:
-            d[key] = None
-
-    return d
-
-
-def gzip_aware_open(fname):
-    if fname == '-':
-        f = sys.stdin
-    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
-        f = gzip.open(os.path.expanduser(fname))
-    else:
-        f = open(os.path.expanduser(fname))
-    return f
-
-
-class gzip_opener:
-    '''
-    A Python 2.6 class to handle 'with' opening of text files that may
-    or may not be gzip compressed.
-    '''
-
-    def __init__(self, fname):
-        self.fname = fname
-
-    def __enter__(self):
-        self.f = gzip_aware_open(self.fname)
-        return self.f
-
-    def __exit__(self, type, value, traceback):
-        if self.f != sys.stdin:
-            self.f.close()
-        return False
-
-
-def filenames_to_uniq(names, new_delim='.'):
-    '''
-    Given a set of file names, produce a list of names consisting of the
-    uniq parts of the names. This works from the end of the name.  Chunks of
-    the name are split on '.' and '-'.
-
-    For example:
-        A.foo.bar.txt
-        B.foo.bar.txt
-        returns: ['A','B']
-
-        AA.BB.foo.txt
-        CC.foo.txt
-        returns: ['AA.BB','CC']
-
-    >>> filenames_to_uniq('a.foo.bar.txt b.foo.bar.txt'.split())
-    ['a', 'b']
-    >>> filenames_to_uniq('a.b.foo.txt c.foo.txt'.split())
-    ['a.b', 'c']
-
-    '''
-    name_words = []
-    maxlen = 0
-    for name in names:
-        name_words.append(name.replace('.', ' ').replace('-', ' ').strip().split())
-        name_words[-1].reverse()
-        if len(name_words[-1]) > maxlen:
-            maxlen = len(name_words[-1])
-
-    common = [False, ] * maxlen
-    for i in xrange(maxlen):
-        last = None
-        same = True
-        for nameword in name_words:
-            if i >= len(nameword):
-                same = False
-                break
-            if not last:
-                last = nameword[i]
-            elif nameword[i] != last:
-                same = False
-                break
-        common[i] = same
-
-    newnames = []
-    for nameword in name_words:
-        nn = []
-        for (i, val) in enumerate(common):
-            if not val and i < len(nameword):
-                nn.append(nameword[i])
-        nn.reverse()
-        newnames.append(new_delim.join(nn))
-    return newnames
-
-
-def parse_args(argv, defaults=None, expected_argc=0):
-    opts = {}
-    if defaults:
-        opts.update(defaults)
-
-    args = []
-
-    i = 0
-    while i < len(argv):
-        if argv[i][0] == '-':
-            arg = argv[i].lstrip('-')
-            if '=' in arg:
-                k, v = arg.split('=', 2)
-                if k in defaults:
-                    if type(defaults[k]) == float:
-                        opts[k] = float(v)
-                    elif type(defaults[k]) == int:
-                        opts[k] = int(v)
-                    else:
-                        opts[k] = v
-            else:
-                opts[arg] = True
-        else:
-            args.append(argv[i])
-        i += 1
-
-    while len(args) < expected_argc:
-        args.append(None)
-    return opts, args
-
-
-class memoize(object):
-    'Simple memoizing decorator to cache results'
-
-    def __init__(self, func):
-        self.func = func
-        self.cache = {}
-
-    def __call__(self, *args):
-        if not isinstance(args, collections.Hashable):
-            # uncacheable. a list, for instance.
-            # better to not cache than blow up.
-            return self.func(*args)
-
-        if args in self.cache:
-            return self.cache[args]
-        else:
-            value = self.func(*args)
-            self.cache[args] = value
-            return value
Binary file ngsutils/support/ngs_utils.pyc has changed
--- a/ngsutils/support/regions.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,166 +0,0 @@
-class RangeMatch(object):
-    '''
-    Simple genomic ranges.  You can define chrom:start-end ranges, then ask if a
-    particular genomic coordinate maps to any of those ranges.  This is less-
-    efficient than an R-Tree, but easier to code.
-    '''
-
-    def __init__(self, name):
-        self.ranges = {}
-        self.name = name
-
-    def add_range(self, chrom, strand, start, end):
-        if chrom not in self.ranges:
-            self.ranges[chrom] = {}
-
-        bin = start / 100000
-        if bin not in self.ranges[chrom]:
-            self.ranges[chrom][bin] = []
-        self.ranges[chrom][bin].insert(0, (start, end, strand))
-
-        if (end / 100000) != bin:
-            for bin in xrange(bin + 1, (end / 100000) + 1):
-                if bin not in self.ranges[chrom]:
-                    self.ranges[chrom][bin] = []
-                self.ranges[chrom][bin].insert(0, (start, end, strand))
-
-    def get_tag(self, chrom, strand, pos, ignore_strand=False):
-        '''
-        returns (region, is_reverse_orientation)
-        '''
-        if chrom not in self.ranges:
-            return None, False
-        bin = pos / 100000
-        if bin not in self.ranges[chrom]:
-            return None, False
-        for start, end, r_strand in self.ranges[chrom][bin]:
-            if pos >= start and pos <= end:
-                if ignore_strand or strand == r_strand:
-                    return self.name, False
-                return self.name, True
-        return None, False
-
-
-class RegionTagger(object):
-    def __init__(self, gtf, valid_chroms=None, only_first_fragment=True):
-        self.regions = []
-        self.counts = {}
-        self.only_first_fragment = only_first_fragment
-
-        coding = RangeMatch('coding')
-        exons = RangeMatch('other-exon')
-        utr_5 = RangeMatch('utr-5')
-        utr_3 = RangeMatch('utr-3')
-        introns = RangeMatch('intron')
-        promoters = RangeMatch('promoter')
-
-        for gene in gtf.genes:
-            if valid_chroms and gene.chrom not in valid_chroms:
-                continue
-            if gene.strand == '+':
-                promoters.add_range(gene.chrom, gene.strand, gene.start - 2000, gene.start)
-            else:
-                promoters.add_range(gene.chrom, gene.strand, gene.end, gene.end + 2000)
-
-            for transcript in gene.transcripts:
-                if transcript.has_cds:
-                    for start, end in transcript.cds:
-                        coding.add_range(gene.chrom, gene.strand, start, end)
-
-                    # TODO: Fix this so that it iterates over exons in the 5'/3' UTRS
-                    for s, e in transcript.utr_5:
-                        utr_5.add_range(gene.chrom, gene.strand, s, e)
-                    for s, e in transcript.utr_3:
-                        utr_3.add_range(gene.chrom, gene.strand, s, e)
-
-                last_end = None
-                for start, end in transcript.exons:
-                    if last_end:
-                        introns.add_range(gene.chrom, gene.strand, last_end, start)
-                    exons.add_range(gene.chrom, gene.strand, start, end)
-                    last_end = end
-
-        self.regions.append(coding)
-        self.regions.append(utr_5)
-        self.regions.append(utr_3)
-        self.regions.append(exons)
-        self.regions.append(introns)
-        self.regions.append(promoters)
-
-        self.counts['coding'] = 0
-        self.counts['coding-rev'] = 0
-        self.counts['other-exon'] = 0
-        self.counts['utr-5'] = 0
-        self.counts['utr-3'] = 0
-        self.counts['utr-5-rev'] = 0
-        self.counts['utr-3-rev'] = 0
-        self.counts['intron'] = 0
-        self.counts['promoter'] = 0
-        self.counts['other-exon-rev'] = 0
-        self.counts['intron-rev'] = 0
-        self.counts['promoter-rev'] = 0
-        self.counts['junction'] = 0
-        self.counts['intergenic'] = 0
-        self.counts['mitochondrial'] = 0
-
-    def add_read(self, read, chrom):
-        if read.is_unmapped:
-            return
-
-        if self.only_first_fragment and read.is_paired and not read.is_read1:
-            return
-
-        tag = None
-        is_rev = False
-
-        strand = '-' if read.is_reverse else '+'
-
-        if chrom == 'chrM':
-            tag = 'mitochondrial'
-
-        if not tag:
-            for op, length in read.cigar:
-                if op == 3:
-                    tag = 'junction'
-                    break
-
-        if not tag:
-            for region in self.regions:
-                tag, is_rev = region.get_tag(chrom, strand, read.pos)
-                if tag:
-                    break
-
-        if not tag:
-            tag = 'intergenic'
-
-        if tag:
-            if is_rev:
-                self.counts['%s-rev' % tag] += 1
-            else:
-                self.counts[tag] += 1
-
-        return tag
-
-    def tag_region(self, chrom, start, end, strand):
-        tag = None
-        is_rev = False
-
-        if chrom == 'chrM' or chrom == 'M':
-            tag = 'mitochondrial'
-
-        if not tag:
-            for region in self.regions:
-                tag, is_rev = region.get_tag(chrom, strand, start)
-                if is_rev:
-                    tag = '%s-rev' % tag
-
-                if start != end:
-                    endtag, is_rev = region.get_tag(chrom, strand, end)
-                    if is_rev:
-                        endtag = '%s-rev' % endtag
-
-                if tag and endtag and endtag != tag:
-                    tag = '%s/%s' % (tag, endtag)
-
-        if not tag:
-            tag = 'intergenic'
--- a/ngsutils/support/stats.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,166 +0,0 @@
-'''
-various statistical tests and methods...
-'''
-import math
-
-from ngsutils.support import memoize
-
-
-def median(vals):
-    '''
-    >>> median([1,2,3])
-    2
-    >>> median([1,2,3,4])
-    2.5
-    '''
-    vals.sort()
-
-    if len(vals) % 2 == 1:
-        return vals[len(vals) / 2]
-    else:
-        a = vals[(len(vals) / 2) - 1]
-        b = vals[(len(vals) / 2)]
-        return float(a + b) / 2
-
-
-def mean_stdev(l):
-    '''
-    >>> mean_stdev([1,2,2,2])
-    (1.75, 0.5)
-    >>> mean_stdev([2,2,2,2])
-    (2.0, 0.0)
-    '''
-
-    acc = 0
-    for el in l:
-        acc += el
-
-    mean = float(acc) / len(l)
-    acc = 0
-    for el in l:
-        acc += (el - mean) ** 2
-
-    if len(l) > 2:
-        stdev = math.sqrt(float(acc) / (len(l) - 1))
-    else:
-        stdev = 0.0
-
-    return (mean, stdev)
-
-
-def counts_median(d):
-    '''
-    Calculate the median from counts stored in a dictionary
-    >>> counts_median({ 1: 4, 2: 1, 3: 4 })
-    2
-    >>> counts_median({ 1: 4, 3: 4 })
-    2
-
-    '''
-    count = 0
-    for k in d:
-        count += d[k]
-
-    if count == 0:
-        return 0
-
-    acc = 0.0
-    last = 0
-    for k in sorted(d):
-        if last:
-            return (last + k) / 2
-        acc += d[k]
-        if acc / count == 0.5:
-            last = k
-        elif acc / count > 0.5:
-            return k
-
-
-def counts_mean_stdev(d):
-    '''
-
-    calc mean / stdev when data is stored as counts in a dictionary
-
-    Ex:
-        { 1: 4, 2: 1, 3: 4 } = [1, 1, 1, 1, 2, 3, 3, 3, 3]
-
-    >>> counts_mean_stdev({ 1: 4, 2: 1, 3: 4 })
-    (2.0, 1.0)
-
-    '''
-
-    acc = 0
-    count = 0
-    for k in d:
-        acc += k * d[k]
-        count += d[k]
-
-    mean = float(acc) / count
-
-    acc = 0
-    for k in d:
-        acc += (((k - mean) ** 2) * d[k])
-
-    if count > 2:
-        stdev = math.sqrt(float(acc) / (count - 1))
-    else:
-        stdev = 0.0
-
-    return (mean, stdev)
-
-
-@memoize
-def poisson_prob(x, mean):
-    '''
-        Return the probability that you could get x counts in
-        a Poisson test with a mean value.
-
-        prob(x) = sum(i=1..x){poisson(i)}
-
-        >>> poisson_prob(6,10)
-        0.1300960209527205
-        >>> poisson_prob(8,10)
-        0.33277427882095645
-    '''
-    acc = 0.0
-    for i in xrange(1, x + 1):
-        acc += poisson_func(i, mean)
-    return acc
-
-
-@memoize
-def poisson_func(mu, lambd):
-    '''
-        This is the Poisson distribution function
-
-        p(mu) = (lambda^mu * e^(-lambda)) / (mu!)
-
-        mu is a count
-        lambd is the mean
-
-        >>> poisson_func(1,10)
-        0.00045399929762484856
-        >>> poisson_func(2,10)
-        0.0022699964881242427
-        >>> poisson_func(3,10)
-        0.007566654960414142
-    '''
-    return (lambd ** mu) * (math.exp(-1 * lambd)) / _factorial(mu)
-
-
-@memoize
-def _factorial(x):
-    '''
-    >>> _factorial(1)
-    1
-    >>> _factorial(2)
-    2
-    >>> _factorial(3)
-    6
-    '''
-    return math.factorial(x)
-
-
-if __name__ == '__main__':
-    import doctest
-    doctest.testmod()