Repository 'ngsutils_bam_filter'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/ngsutils_bam_filter

Changeset 3:9b9ae5963d3c (2017-10-23)
Previous changeset: 2:7a68005de299 (2016-11-27)
Next changeset: 4:aecfe10118ed (2017-11-17)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 4a3c9f195ba5d899b1a1ce5e80281cdf230f456a
modified:
bam_filter.xml
filter.py
macros.xml
removed:
ngsutils/__init__.py
ngsutils/__init__.pyc
ngsutils/bam/__init__.py
ngsutils/bam/__init__.pyc
ngsutils/bed/__init__.py
ngsutils/bed/__init__.pyc
ngsutils/support/__init__.py
ngsutils/support/__init__.pyc
ngsutils/support/bgzip.py
ngsutils/support/dbsnp.py
ngsutils/support/dbsnp.pyc
ngsutils/support/llh.py
ngsutils/support/ngs_utils.py
ngsutils/support/ngs_utils.pyc
ngsutils/support/regions.py
ngsutils/support/stats.py
diff -r 7a68005de299 -r 9b9ae5963d3c bam_filter.xml
--- a/bam_filter.xml Sun Nov 27 15:01:21 2016 -0500
+++ b/bam_filter.xml Mon Oct 23 13:27:02 2017 -0400
@@ -1,4 +1,4 @@
-<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@.1">
+<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@">
     <description>Removes reads from a BAM file based on criteria</description>
     <macros>
         <import>macros.xml</import>
@@ -7,67 +7,61 @@
     <expand macro="stdio" />
     <expand macro="version" />
     <command><![CDATA[
-
-    ## If the tool is executed with no filtering option,
-    ## the default parameters simply copy over the input file
-    if grep -q "\w" ${parameters};
-    then
-        $__tool_directory__/filter.py
-        $infile
-        $outfile
-        `cat ${parameters}`;
-    else
-        cp $infile $outfile;
-    fi
-
-]]>
-    </command>
+## If the tool is executed with no filtering option,
+## the default parameters simply copy over the input file
+if grep -q "\w" '${parameters}'; then
+    '$__tool_directory__/filter.py'
+    '$infile'
+    '$outfile'
+    `cat ${parameters}`;
+else
+    cp '$infile' '$outfile';
+fi
+    ]]></command>
     <configfiles>
-        <configfile name="parameters">
-<![CDATA[
-        #if $minlen:
-            -minlen $minlen
-        #end if
-        #if $maxlen
-            -maxlen $maxlen
-        #end if
-        $mapped
-        $unmapped
-        $properpair
-        $noproperpair
-        #if $mask:
-            -mask ${mask}
-        #end if
-        #if int($uniq) > -1:
-            -uniq
-            #if int($uniq) > 0:
-                $uniq
-            #end if
-        #end if
-        $uniq_start
-        #if $mismatch:
-            -mismatch $mismatch
-        #end if
-        $nosecondary
-        $noqcfail
-        $nopcrdup
-        #if $excludebed:
-            -excludebed ${excludebed} $ignore_strand
-        #end if
-        #if $includebed:
-            -includebed ${includebed} $ignore_strand
-        #end if
-        #if $includeref:
-            -includeref ${includeref}
-        #end if
-        #if $excluderef:
-            -excluderef ${excluderef}
-        #end if
-        #if $maximum_mismatch_ratio
-            -maximum_mismatch_ratio $maximum_mismatch_ratio
-        #end if
-    ]]>
-        </configfile>
+        <configfile name="parameters"><![CDATA[
+#if $minlen:
+    -minlen $minlen
+#end if
+#if $maxlen
+    -maxlen $maxlen
+#end if
+$mapped
+$unmapped
+$properpair
+$noproperpair
+#if $mask:
+    -mask ${mask}
+#end if
+#if int($uniq) > -1:
+    -uniq
+    #if int($uniq) > 0:
+        $uniq
+    #end if
+#end if
+$uniq_start
+#if $mismatch:
+    -mismatch $mismatch
+#end if
+$nosecondary
+$noqcfail
+$nopcrdup
+#if $excludebed:
+    -excludebed ${excludebed} $ignore_strand
+#end if
+#if $includebed:
+    -includebed ${includebed} $ignore_strand
+#end if
+#if $includeref:
+    -includeref ${includeref}
+#end if
+#if $excluderef:
+    -excluderef ${excluderef}
+#end if
+#if $maximum_mismatch_ratio
+    -maximum_mismatch_ratio $maximum_mismatch_ratio
+#end if
+        ]]></configfile>
     </configfiles>
     <inputs>
         <param name="infile" type="data" format="bam" label="Select BAM dataset" />
@@ -129,7 +123,6 @@
         <param argument="-maximum_mismatch_ratio" type="float" value="" optional="True" min="0.0" max="1.0"
             label="Filter by maximum mismatch ratio"
             help="fraction of length"/>
-
     </inputs>
     <outputs>
         <data format="bam" name="outfile" />
@@ -234,6 +227,5 @@
 | -excluderef refname            | Exclude reads mapped to a particular reference  |
 |                                | (e.g. chrM, or _dup chromosomes)                |
 +--------------------------------+-------------------------------------------------+
-]]>
-    </help>
+    ]]></help>
 </tool>
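Note on the command-block rewrite above: besides reflowing the CDATA, the functional change is that '$infile', '$outfile' and the filter.py path are now single-quoted, so dataset paths containing spaces or shell metacharacters survive shell word-splitting. A minimal sketch of why the quoting matters (the path is hypothetical, not from the tool):

    import shlex

    path = '/galaxy/datasets/dataset 1.bam'  # hypothetical path containing a space
    print(shlex.split('cp %s out.bam' % path))    # ['cp', '/galaxy/datasets/dataset', '1.bam', 'out.bam'] - path split in two
    print(shlex.split("cp '%s' out.bam" % path))  # ['cp', '/galaxy/datasets/dataset 1.bam', 'out.bam'] - path intact

The `cat ${parameters}` expansion is left unquoted, which appears intentional: its output must be word-split into separate command-line flags.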
diff -r 7a68005de299 -r 9b9ae5963d3c filter.py
--- a/filter.py Sun Nov 27 15:01:21 2016 -0500
+++ b/filter.py Mon Oct 23 13:27:02 2017 -0400
@@ -107,6 +107,7 @@
     The tag type (:i, :f, :Z) is optional.
 
 """
+from __future__ import print_function
 
 import os
 import sys
@@ -118,8 +119,8 @@
 
 
 def usage():
-    print __doc__
-    print """
+    print(__doc__)
+    print("""
 Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria...
 
 Options:
@@ -131,7 +132,7 @@
 
 This will remove all unmapped reads, as well as any reads that have an AS:i
 value less than 1000.
-"""
+""")
     sys.exit(1)
 
 
@@ -760,10 +761,10 @@
             # guess at type...
             try:
                 self.value = int(value)
-            except:
+            except ValueError:
                 try:
                     self.value = float(value)
-                except:
+                except ValueError:
                     self.value = value
 
     def get_value(self, read):
@@ -960,7 +961,7 @@
             outfile = arg
         elif arg[0] == '-':
             if not arg[1:] in _criteria:
-                print "Unknown criterion: %s" % arg
+                print("Unknown criterion: %s" % arg)
                 fail = True
             if crit_args:
                 criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
@@ -968,7 +969,7 @@
         elif crit_args:
             crit_args.append(arg)
         else:
-            print "Unknown argument: %s" % arg
+            print("Unknown argument: %s" % arg)
             fail = True
 
     if not fail and crit_args:
@@ -979,11 +980,11 @@
             usage()
 
         if not infile:
-            print "Missing: input bamfile"
+            print("Missing: input bamfile")
         if not outfile:
-            print "Missing: output bamfile"
+            print("Missing: output bamfile")
         if not criteria:
-            print "Missing: filtering criteria"
+            print("Missing: filtering criteria")
         usage()
     else:
         bam_filter(infile, outfile, criteria, failed, verbose)
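The filter.py hunks are a minimal Python 3 compatibility pass: print statements become the print() function (enabled on Python 2 by the __future__ import), and the bare except clauses around the value coercion are narrowed to ValueError so unrelated exceptions are no longer swallowed. A self-contained sketch of that try-int/try-float pattern as it now reads:

    from __future__ import print_function  # no-op on Python 3

    def guess_type(value):
        # Coerce to int, then float, falling back to the raw string.
        # Catching only ValueError (not a bare except) keeps real bugs and
        # signals such as KeyboardInterrupt from being masked.
        try:
            return int(value)
        except ValueError:
            try:
                return float(value)
            except ValueError:
                return value

    print(guess_type('1000'), guess_type('0.5'), guess_type('AS'))  # 1000 0.5 AS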
diff -r 7a68005de299 -r 9b9ae5963d3c macros.xml
--- a/macros.xml Sun Nov 27 15:01:21 2016 -0500
+++ b/macros.xml Mon Oct 23 13:27:02 2017 -0400
@@ -1,8 +1,8 @@
 <macros>
-    <token name="@WRAPPER_VERSION@">0.5.8</token>
+    <token name="@WRAPPER_VERSION@">0.5.9</token>
     <xml name="requirements">
         <requirements>
-            <requirement type="package" version="0.9.1.4">pysam</requirement>
+            <requirement type="package" version="@WRAPPER_VERSION@">ngsutils</requirement>
         </requirements>
     </xml>
     <xml name="version">
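The macros.xml change does two things: it bumps @WRAPPER_VERSION@ to 0.5.9, and it replaces the direct pysam dependency with the ngsutils package pinned to that same token, so the wrapper version and the package version stay in lock-step (which is also why the ".1" suffix on the tool version above could be dropped). A toy sketch of how such token substitution behaves (hypothetical; Galaxy performs the real expansion):

    # Hypothetical illustration of macro-token expansion, not Galaxy's implementation
    tokens = {'@WRAPPER_VERSION@': '0.5.9'}
    xml = '<requirement type="package" version="@WRAPPER_VERSION@">ngsutils</requirement>'
    for token, value in tokens.items():
        xml = xml.replace(token, value)
    print(xml)  # ...version="0.5.9">ngsutils...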
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/__init__.pyc
Binary file ngsutils/__init__.pyc has changed
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bam/__init__.py
--- a/ngsutils/bam/__init__.py Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,880 +0,0 @@
(entire file deleted: 880 lines. The module held the BAM helpers bam_open, bam_pileup_iter, bam_iter and bam_batch_reads, plus CIGAR utilities: the bam_cigar/bam_cigar_op tables, cigar_fromstr, cigar_tostr, cleancigar, read_cleancigar, read_alignment_fragments_gen, read_cigar_at_pos and read_to_unmapped, with doctests.)
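Of the deleted helpers, the CIGAR parser is the most self-contained. A rough reconstruction of cigar_fromstr, consistent with the op table and the '10M5I20M' doctest in the deleted source (an illustration, not the original code):

    import re

    # Op codes from the deleted bam_cigar_op table: M=0, I=1, D=2, N=3, S=4, H=5, P=6, ==7, X=8
    BAM_CIGAR_OP = {'M': 0, 'I': 1, 'D': 2, 'N': 3, 'S': 4, 'H': 5, 'P': 6, '=': 7, 'X': 8}

    def cigar_fromstr(s):
        # '10M5I20M' -> [(0, 10), (1, 5), (0, 20)]
        return [(BAM_CIGAR_OP[op], int(n)) for n, op in re.findall(r'(\d+)([MIDNSHP=X])', s)]

    print(cigar_fromstr('10M5I20M'))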
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bam/__init__.pyc
Binary file ngsutils/bam/__init__.pyc has changed
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bed/__init__.py
--- a/ngsutils/bed/__init__.py Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,282 +0,0 @@
(entire file deleted: 282 lines. The module held the BED helpers BedStreamer, BedFile — regions kept in memory in 100 kb bins, with optional Tabix-index random access via fetch, strand-aware filtering, and length/total properties — and BedRegion, with clone, score_int, comparison methods, write and __repr__.)
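The deleted BedFile answered overlap queries without a real index by hashing every region into fixed 100 kb bins (its _bin_const) keyed by (chrom, bin); a lookup then only scans regions sharing a bin with the query. A small sketch of that scheme (illustrative, not the original code):

    BIN = 100000  # BedFile._bin_const in the deleted source

    def bins_for(start, end):
        return range(start // BIN, end // BIN + 1)

    index = {}
    for region in [('chr1', 500, 900), ('chr1', 99950, 100200)]:
        chrom, start, end = region
        for b in bins_for(start, end):
            index.setdefault((chrom, b), []).append(region)

    # A query only visits the bins it overlaps:
    qstart, qend = 100000, 100100
    hits = [r for b in bins_for(qstart, qend)
            for r in index.get(('chr1', b), [])
            if r[1] < qend and r[2] > qstart]
    print(hits)  # [('chr1', 99950, 100200)]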
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bed/__init__.pyc
Binary file ngsutils/bed/__init__.pyc has changed
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/__init__.py
--- a/ngsutils/support/__init__.py Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,254 +0,0 @@
-import collections
-import gzip
-import os
-import re
-import sys
-try:
-    from eta import ETA
-except:
-    pass
-
-
-class FASTARead(collections.namedtuple('FASTARecord', 'name comment seq')):
-    def __repr__(self):
-        if self.comment:
-            return '>%s %s\n%s\n' % (self.name, self.comment, self.seq)
-        return '>%s\n%s\n' % (self.name, self.seq)
-
-    def subseq(self, start, end, comment=None):
-        if self.comment:
-            comment = '%s %s' % (self.comment, comment)
-
-        return FASTARead(self.name, comment, self.seq[start:end])
-
-    def clone(self, name=None, comment=None, seq=None):
-        n = name if name else self.name
-        c = comment if comment else self.comment
-        s = seq if seq else self.seq
-
-        return FASTARead(n, c, s)
-
-    def write(self, out):
-        out.write(repr(self))
-
-
-class FASTA(object):
-    def __init__(self, fname=None, fileobj=None, qual=False):
-        self.fname = fname
-        self.qual = qual
-        if fileobj:
-            self.fileobj = fileobj
-        else:
-            if self.fname == '-':
-                self.fileobj = sys.stdin
-            elif self.fname[-3:] == '.gz' or self.fname[-4:] == '.bgz':
-                self.fileobj = gzip.open(os.path.expanduser(self.fname))
-            else:
-                self.fileobj = open(os.path.expanduser(self.fname))
-
-        if not self.fileobj:
-            raise ValueError("Missing valid filename or fileobj")
-
-    def close(self):
-        if self.fileobj != sys.stdout:
-            self.fileobj.close()
-
-    def tell(self):
-        # always relative to uncompressed...
-        return self.fileobj.tell()
-
-    def seek(self, pos, whence=0):
-        self.fileobj.seek(pos, whence)
-
-    def fetch(self, quiet=False):
-        name = ''
-        comment = ''
-        seq = ''
-
-        if not quiet and self.fname and self.fname != '-':
-            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
-        else:
-            eta = None
-
-        for line in self.fileobj:
-            line = line.strip()
-            if not line:
-                continue
-            if line[0] == '#':
-                continue
-
-            if line[0] == '>':
-                if name and seq:
-                    if eta:
-                        eta.print_status(extra=name)
-                    yield FASTARead(name, comment, seq)
-
-                spl = re.split(r'[ \t]', line[1:], maxsplit=1)
-                name = spl[0]
-                if len(spl) > 1:
-                    comment = spl[1]
-                else:
-                    comment = ''
-                seq = ''
-
-            else:
-                if self.qual:
-                    seq = seq + ' ' + line
-                else:
-                    seq += line
-
-        if name and seq:
-            if eta:
-                eta.print_status(extra=name)
-            yield FASTARead(name, comment, seq)
-
-        if eta:
-            eta.done()
-
-
-def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None):
-    if fileobj:
-        f = fileobj
-    elif fname == '-':
-        f = sys.stdin
-    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
-        f = gzip.open(os.path.expanduser(fname))
-    else:
-        f = open(os.path.expanduser(fname))
-
-    if quiet or fname == '-':
-        eta = None
-    else:
-        eta = ETA(os.stat(fname).st_size, fileobj=f)
-
-    for line in f:
-        if eta:
-            if callback:
-                extra = callback()
-            else:
-                extra = ''
-
-            eta.print_status(extra=extra)
-        yield line
-
-        if done_callback and done_callback():
-                break
-
-    if f != sys.stdin:
-        f.close()
-
-    if eta:
-        eta.done()
-
-
-class Symbolize(object):
-    'Converts strings to symbols - basically a cache of strings'
-
-    def __init__(self):
-        self.__cache = {}
-
-    def __getitem__(self, k):
-        if k not in self.__cache:
-            self.__cache[k] = k
-
-        return self.__cache[k]
-
-
-symbols = Symbolize()
-
-_compliments = {
-    'a': 't',
-    'A': 'T',
-    'c': 'g',
-    'C': 'G',
-    'g': 'c',
-    'G': 'C',
-    't': 'a',
-    'T': 'A',
-    'n': 'n',
-    'N': 'N'
-}
-
-
-def revcomp(seq):
-    '''
-    >>> revcomp('ATCGatcg')
-    'cgatCGAT'
-    '''
-    ret = []
-
-    for s in seq:
-        ret.append(_compliments[s])
-
-    ret.reverse()
-    return ''.join(ret)
-
-
-class Counts(object):
-    '''
-    Setup simple binning.  Bins are continuous 0->max.  Values are added to
-    bins and then means / distributions can be calculated.
-    '''
-
-    def __init__(self):
-        self.bins = []
-
-    def add(self, val):
-        while len(self.bins) <= val:
-            self.bins.append(0)
-        self.bins[val] += 1
-
-    def mean(self):
-        acc = 0
-        count = 0
-
-        for i, val in enumerate(self.bins):
-            acc += (i * val)
-            count += val
-
-        if count > 0:
-            return float(acc) / count
-
-    def max(self):
-        return len(self.bins) - 1
-
-
-def memoize(func):
-    if 'TESTING' in os.environ or 'DEBUG' in os.environ:
-        return func
-
-    __cache = {}
-
-    def inner(*args, **kwargs):
-        k = (args, tuple(kwargs.iteritems()))
-        if k not in __cache:
-            __cache[k] = func(*args, **kwargs)
-        return __cache[k]
-
-    inner.__doc__ = '(@memoized %s)\n%s' % (func.__name__, func.__doc__)
-    return inner
-
-
-def quoted_split(s, delim, quote_char='"'):
-    tokens = []
-
-    buf = ""
-    inquote = False
-
-    for c in s:
-        if inquote:
-            buf += c
-            if c == quote_char:
-                inquote = False
-        elif c == delim:
-            tokens.append(buf)
-            buf = ""
-        else:
-            buf += c
-            if c == quote_char:
-                inquote = True
-
-    if buf:
-        tokens.append(buf)
-
-    return tokens
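Most of the deleted support module is small, self-contained utilities; revcomp, for instance, is a lookup-table reverse complement. A self-contained equivalent (not the original code) that matches the doctest above:

    # Table lookup, then reversal - equivalent to the deleted revcomp()
    COMPLEMENT = {'a': 't', 'A': 'T', 'c': 'g', 'C': 'G',
                  'g': 'c', 'G': 'C', 't': 'a', 'T': 'A', 'n': 'n', 'N': 'N'}

    def revcomp(seq):
        return ''.join(COMPLEMENT[base] for base in reversed(seq))

    print(revcomp('ATCGatcg'))  # 'cgatCGAT', as in the doctest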
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/__init__.pyc
Binary file ngsutils/support/__init__.pyc has changed
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/bgzip.py
--- a/ngsutils/support/bgzip.py Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,138 +0,0 @@
-#!/usr/bin/env python
-'''
-Extract the blocks from a BGZip file.
-
-BAM files are stored as blocks in a bgzip archive. This class
-will load the bgzip archive and output the block information.
-'''
-
-import os
-import struct
-import sys
-
-
-class BGZip(object):
-    def __init__(self, fname):
-        self.fname = fname
-        self.pos = 0
-        self.fileobj = open(self.fname)
-        self.fsize = os.stat(self.fname).st_size
-        self.cur_chunk = 0
-
-        self.cpos = 0
-        self.cdata = 0
-
-    def close(self):
-        self.fileobj.close()
-
-    def seek(self, offset):
-        bgz_offset = offset >> 16
-        chunk_offset = offset & 0xFFFF
-
-        self.fileobj.seek(bgz_offset)
-        self.read_chunk()
-        self.chunk_pos = chunk_offset
-
-    def read(self, amount, whence=0):
-        if whence not in [0, 1]:
-            print "Bad Whence value!: %s" % whence
-            return
-
-        if whence == 0:
-            self.seek(0, 0)
-
-        # read into chunk, if not enough data in chunk, read next chunk
-        ret = ''
-        while amount and self.pos < self.fsize:
-            if len(self.cdata) - self.cpos < amount:
-                ret += self.cdata[self.cpos:self.cpos + amount]
-                self.cpos += amount
-                return ret
-            else:
-                ret += self.cdata[self.cpos:]
-                amount = amount - len(ret)
-                self.read_chunk()
-        return ret
-
-    def read_chunk(self):
-        self.fileobj.seek(10)
-        id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
-        subpos = 0
-        bsize = 0
-
-        while subpos < xlen:
-            si1, si2, slen = self._read_fields('<BBH')
-            if si1 == 66 and si2 == 67:
-                bsize, = self._read_fields('<H')
-            else:
-                self.fileobj.seek(slen, 1)
-                self.pos += slen
-
-            subpos += 6 + slen
-
-        cdata_size = bsize - xlen - 19
-        self.cdata = self.fileobj.read(cdata_size)  # inflate value
-        self.fileobj.seek(8)
-
-        self.cur_chunk += 1
-        self.cpos = 0
-
-    def dump(self):
-        self.fileobj.seek(0)
-        block_num = 0
-
-        while self.pos < self.fsize:
-            print "[BLOCK %s]" % block_num
-            # read header
-            id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
-            print 'id1: %s' % id1
-            print 'id2: %s' % id2
-            print 'cm: %s' % cm
-            print 'flg: %s' % flg
-            print 'mtime: %s' % mtime
-            print 'xfl: %s' % xfl
-            print 'os: %s' % os
-            print 'xlen: %s' % xlen
-
-            # read subfields
-            subpos = 0
-            bsize = 0
-
-            while subpos < xlen:
-                si1, si2, slen = self._read_fields('<BBH')
-                print '    si1: %s' % si1
-                print '    si1: %s' % si2
-                print '    slen: %s' % slen
-                print '    data: [%s]' % slen
-
-                if si1 == 66 and si2 == 67:
-                    bsize, = self._read_fields('<H')
-                else:
-                    self.fileobj.seek(slen, 1)
-                    self.pos += slen
-
-                subpos += 6 + slen
-
-            cdata_size = bsize - xlen - 19
-
-            print 'bsize: %s' % bsize
-            print 'cdata: [%s]' % (cdata_size)
-
-            self.fileobj.seek(cdata_size, 1)
-            self.pos += cdata_size
-            crc, isize = self._read_fields('<II')
-
-            print "crc: %s" % crc
-            print "isize: %s" % isize
-            # print "POS: %s" % self.pos
-
-            block_num += 1
-
-    def _read_fields(self, field_types):
-        size = struct.calcsize(field_types)
-        self.pos += size
-        return struct.unpack(field_types, self.fileobj.read(size))
-
-
-if __name__ == '__main__':
-    print BGZip(sys.argv[1]).dump()
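The seek() logic above relies on BGZF "virtual offsets": the high 48 bits of a 64-bit offset address the start of a compressed block in the file, and the low 16 bits address a byte within that block once inflated. A short sketch of the arithmetic:

    # BGZF virtual-offset arithmetic, as used by BGZip.seek() above
    def split_voffset(voffset):
        return voffset >> 16, voffset & 0xFFFF  # (compressed block start, offset within block)

    def make_voffset(block_start, within_block):
        return (block_start << 16) | within_block

    v = make_voffset(123456, 789)
    print(split_voffset(v))  # (123456, 789)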
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/dbsnp.py
--- a/ngsutils/support/dbsnp.py Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,142 +0,0 @@
-'''
-Support package for processing a dbSNP tabix dump from UCSC.
-'''
-
-import collections
-import sys
-
-import pysam
-from ngsutils.support import revcomp
-
-
-class SNPRecord(collections.namedtuple('SNPRecord', '''bin
-chrom
-chromStart
-chromEnd
-name
-score
-strand
-refNCBI
-refUCSC
-observed
-molType
-clazz
-valid
-avHet
-avHetSE
-func
-locType
-weight
-exceptions
-submitterCount
-submitters
-alleleFreqCount
-alleles
-alleleNs
-alleleFreqs
-bitfields
-''')):
-    __slots__ = ()
-
-    @property
-    def alleles(self):
-        alts = []
-        for alt in self.observed.split('/'):
-            if alt != '-' and self.strand == '-':
-                alt = revcomp(alt)
-
-            alts.append(alt)
-
-        return alts
-
-    @property
-    def snp_length(self):
-        return self.chromEnd - self.chromStart
-
-
-def autotype(ar, length=26):
-    out = []
-    for el in ar:
-        val = None
-        try:
-            val = int(el)
-        except:
-            try:
-                val = float(el)
-            except:
-                val = el
-
-        out.append(val)
-
-    if len(out) < 12:
-        raise TypeError("Invalid dbSNP file! We need at least 12 columns to work with.")
-
-    while len(out) < length:
-        out.append(None)
-    return out
-
-
-class DBSNP(object):
-    def __init__(self, fname):
-        self.dbsnp = pysam.Tabixfile(fname)
-        self.asTup = pysam.asTuple()
-
-    def fetch(self, chrom, pos):
-        'Note: pos is 0-based'
-
-        # Note: tabix the command uses 1-based positions, but
-        #       pysam.Tabixfile uses 0-based positions
-
-        for tup in self.dbsnp.fetch(chrom, pos, pos + 1, parser=self.asTup):
-            snp = SNPRecord._make(autotype(tup))
-            if snp.chromStart == pos:
-                yield snp
-
-    def close(self):
-        self.dbsnp.close()
-
-    def dump(self, chrom, op, pos, base, snp, exit=True):
-        print
-        print ' ->', op, chrom, pos, base
-        print snp
-        print snp.alleles
-        if exit:
-            sys.exit(1)
-
-    def is_valid_variation(self, chrom, op, pos, seq, verbose=False):
-        for snp in self.fetch(chrom, pos):
-            if '/' not in snp.observed or snp.clazz not in ['single', 'mixed', 'in-del', 'insertion', 'deletion']:
-                # these are odd variations that we can't deal with... (microsatellites, tooLongToDisplay members, etc)
-                continue
-
-            if op == 0:
-                if snp.clazz in ['single', 'mixed'] and seq in snp.alleles:
-                    return True
-                # elif verbose:
-                #     for alt in snp.alleles:
-                #         if len(alt) > 1:
-                #             self.dump(chrom, op, pos, seq, snp, False)
-
-            elif op == 1:
-                if snp.clazz in ['insertion', 'mixed', 'in-del']:
-                    if seq in snp.alleles:
-                        if len(seq) > 1 and verbose:
-                            self.dump(chrom, op, pos, seq, snp, False)
-                        return True
-
-                    if verbose:
-                        if len(seq) > 1:
-                            self.dump(chrom, op, pos, seq, snp, False)
-                        else:
-                            for alt in snp.alleles:
-                                if len(alt) > 1:
-                                    self.dump(chrom, op, pos, seq, snp, False)
-
-            elif op == 2:
-                if snp.clazz in ['deletion', 'mixed', 'in-del']:
-                    if '-' in snp.alleles and seq in snp.alleles:
-                        if len(seq) > 1 and verbose:
-                            self.dump(chrom, op, pos, seq, snp, False)
-                        return True
-
-        return False
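As the comment in fetch() notes, pysam.Tabixfile takes 0-based coordinates (half-open intervals) while the tabix command line is 1-based and inclusive; a single base at 0-based position pos is therefore the interval [pos, pos + 1). A tiny sketch of the conversion (positions are hypothetical):

    # 0-based single position -> equivalent 1-based tabix CLI region string
    def tabix_cli_region(chrom, pos0):
        return '%s:%d-%d' % (chrom, pos0 + 1, pos0 + 1)

    print(tabix_cli_region('chr1', 10176))  # chr1:10177-10177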
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/dbsnp.pyc
Binary file ngsutils/support/dbsnp.pyc has changed
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/llh.py
--- a/ngsutils/support/llh.py Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,56 +0,0 @@
-'''
-Methods for calculating log-likelihoods for nucleotide frequencies
-'''
-import collections
-import math
-
-from ngsutils.support import memoize
-
-_default_background = {'A': 0.3, 'T': 0.3, 'C': 0.2, 'G': 0.2}
-
-NucleotideLogLikelihood = collections.namedtuple('NucleotideLogLikelihood', 'A C G T pseudo')
-
-
-@memoize
-def pseudo_count(N, bg):
-    '''
-    >>> pseudo_count(100, _default_background['A'])
-    3
-    >>> pseudo_count(100, _default_background['C'])
-    2
-    '''
-
-    return bg * math.sqrt(N)
-
-
-def calc_llh(A, C, G, T, bg=_default_background, pseudo='auto'):
-    if pseudo == 'auto':
-        N = A + C + G + T
-        Ap = float(A) + pseudo_count(N, bg['A'])
-        Cp = float(C) + pseudo_count(N, bg['C'])
-        Gp = float(G) + pseudo_count(N, bg['G'])
-        Tp = float(T) + pseudo_count(N, bg['T'])
-    elif pseudo:
-        Ap = float(A) + pseudo
-        Cp = float(C) + pseudo
-        Gp = float(G) + pseudo
-        Tp = float(T) + pseudo
-    else:
-        Ap = float(A)
-        Cp = float(C)
-        Gp = float(G)
-        Tp = float(T)
-
-    Np = Ap + Cp + Gp + Tp
-
-    freqA = Ap / Np
-    freqC = Cp / Np
-    freqG = Gp / Np
-    freqT = Tp / Np
-
-    return NucleotideLogLikelihood(math.log(freqA / bg['A']), math.log(freqC / bg['C']), math.log(freqG / bg['G']), math.log(freqT / bg['T']), pseudo)
-
-
-if __name__ == '__main__':
-    import doctest
-    doctest.testmod()
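The pseudocount scheme above scales with depth: for N total counts and background frequency bg, it adds bg * sqrt(N) counts, so pseudo_count(100, 0.3) = 3 as in the doctest. A worked, self-contained check of the calc_llh arithmetic with pseudo='auto' (the counts are made up for illustration):

    import math

    bg = {'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3}  # _default_background
    counts = {'A': 60, 'C': 20, 'G': 15, 'T': 5}
    N = sum(counts.values())                                    # 100
    padded = {b: counts[b] + bg[b] * math.sqrt(N) for b in bg}  # adds 3, 2, 2, 3
    Np = sum(padded.values())                                   # 110.0
    llh = {b: math.log(padded[b] / Np / bg[b]) for b in bg}     # log(freq / background)
    print({b: round(v, 3) for b, v in sorted(llh.items())})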
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/ngs_utils.py
--- a/ngsutils/support/ngs_utils.py Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,225 +0,0 @@
-#!/usr/bin/env python
-"""
-Common util classes / functions for the NGS project
-"""
-import collections
-import gzip
-import os
-import re
-import sys
-
-
-def format_number(n):
-    '''
-    >>> format_number(1000)
-    '1,000'
-    >>> format_number(1234567)
-    '1,234,567'
-    '''
-    ar = list(str(n))
-    for i in range(len(ar))[::-3][1:]:
-        ar.insert(i + 1, ',')
-    return ''.join(ar)
-
-
-def natural_sort(ar):
-    '''
-    >>> natural_sort('1 3 4 2 5'.split())
-    ['1', '2', '3', '4', '5']
-    >>> natural_sort('1 10 20 2 3 4'.split())
-    ['1', '2', '3', '4', '10', '20']
-    '''
-    to_sort = []
-    for item in ar:
-        spl = re.split('(\d+)', item)
-        l2 = []
-        for el in spl:
-            try:
-                n = int(el)
-            except:
-                n = el
-            l2.append(n)
-        to_sort.append((l2, item))
-
-    to_sort.sort()
-    return [x[1] for x in to_sort]
-
-
-def dictify(values, colnames):
-    """
-    Convert a list of values into a dictionary based upon given column names.
-
-    If the column name starts with an '@', the value is assumed to be a comma
-    separated list.
-
-    If the name starts with a '#', the value is assumed to be an int.
-
-    If the name starts with '@#', the value is assumed to  a comma separated
-    list of ints.
-
-    """
-    d = {}
-    for i in xrange(len(colnames)):
-        key = colnames[i]
-        split = False
-        num = False
-
-        if key[0] == '@':
-            key = key[1:]
-            split = True
-        if key[0] == '#':
-            key = key[1:]
-            num = True
-
-        if i < len(values):
-            if num and split:
-                val = [int(x) for x in values[i].rstrip(',').split(',')]
-            elif num:
-                val = int(values[i])
-            elif split:
-                val = values[i].rstrip(',').split(',')
-            else:
-                val = values[i]
-
-            d[key] = val
-
-        else:
-            d[key] = None
-
-    return d
-
-
-def gzip_aware_open(fname):
-    if fname == '-':
-        f = sys.stdin
-    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
-        f = gzip.open(os.path.expanduser(fname))
-    else:
-        f = open(os.path.expanduser(fname))
-    return f
-
-
-class gzip_opener:
-    '''
-    A Python 2.6 class to handle 'with' opening of text files that may
-    or may not be gzip compressed.
-    '''
-
-    def __init__(self, fname):
-        self.fname = fname
-
-    def __enter__(self):
-        self.f = gzip_aware_open(self.fname)
-        return self.f
-
-    def __exit__(self, type, value, traceback):
-        if self.f != sys.stdin:
-            self.f.close()
-        return False
-
-
-def filenames_to_uniq(names, new_delim='.'):
-    '''
-    Given a set of file names, produce a list of names consisting of the
-    uniq parts of the names. This works from the end of the name.  Chunks of
-    the name are split on '.' and '-'.
-
-    For example:
-        A.foo.bar.txt
-        B.foo.bar.txt
-        returns: ['A','B']
-
-        AA.BB.foo.txt
-        CC.foo.txt
-        returns: ['AA.BB','CC']
-
-    >>> filenames_to_uniq('a.foo.bar.txt b.foo.bar.txt'.split())
-    ['a', 'b']
-    >>> filenames_to_uniq('a.b.foo.txt c.foo.txt'.split())
-    ['a.b', 'c']
-
-    '''
-    name_words = []
-    maxlen = 0
-    for name in names:
-        name_words.append(name.replace('.', ' ').replace('-', ' ').strip().split())
-        name_words[-1].reverse()
-        if len(name_words[-1]) > maxlen:
-            maxlen = len(name_words[-1])
-
-    common = [False, ] * maxlen
-    for i in xrange(maxlen):
-        last = None
-        same = True
-        for nameword in name_words:
-            if i >= len(nameword):
-                same = False
-                break
-            if not last:
-                last = nameword[i]
-            elif nameword[i] != last:
-                same = False
-                break
-        common[i] = same
-
-    newnames = []
-    for nameword in name_words:
-        nn = []
-        for (i, val) in enumerate(common):
-            if not val and i < len(nameword):
-                nn.append(nameword[i])
-        nn.reverse()
-        newnames.append(new_delim.join(nn))
-    return newnames
-
-
-def parse_args(argv, defaults=None, expected_argc=0):
-    opts = {}
-    if defaults:
-        opts.update(defaults)
-
-    args = []
-
-    i = 0
-    while i < len(argv):
-        if argv[i][0] == '-':
-            arg = argv[i].lstrip('-')
-            if '=' in arg:
-                k, v = arg.split('=', 2)
-                if k in defaults:
-                    if type(defaults[k]) == float:
-                        opts[k] = float(v)
-                    elif type(defaults[k]) == int:
-                        opts[k] = int(v)
-                    else:
-                        opts[k] = v
-            else:
-                opts[arg] = True
-        else:
-            args.append(argv[i])
-        i += 1
-
-    while len(args) < expected_argc:
-        args.append(None)
-    return opts, args
-
-
-class memoize(object):
-    'Simple memoizing decorator to cache results'
-
-    def __init__(self, func):
-        self.func = func
-        self.cache = {}
-
-    def __call__(self, *args):
-        if not isinstance(args, collections.Hashable):
-            # uncacheable. a list, for instance.
-            # better to not cache than blow up.
-            return self.func(*args)
-
-        if args in self.cache:
-            return self.cache[args]
-        else:
-            value = self.func(*args)
-            self.cache[args] = value
-            return value
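natural_sort above works by splitting each string into alternating text and integer runs, so '10' compares numerically against '2'. A self-contained equivalent matching its doctests (not the original code):

    import re

    def natural_sort(items):
        def key(item):
            return [int(part) if part.isdigit() else part
                    for part in re.split(r'(\d+)', item)]
        return sorted(items, key=key)

    print(natural_sort('1 10 20 2 3 4'.split()))  # ['1', '2', '3', '4', '10', '20']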
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/ngs_utils.pyc
Binary file ngsutils/support/ngs_utils.pyc has changed
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/regions.py
--- a/ngsutils/support/regions.py Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,166 +0,0 @@
-class RangeMatch(object):
-    '''
-    Simple genomic ranges.  You can define chrom:start-end ranges, then ask if a
-    particular genomic coordinate maps to any of those ranges.  This is less-
-    efficient than an R-Tree, but easier to code.
-    '''
-
-    def __init__(self, name):
-        self.ranges = {}
-        self.name = name
-
-    def add_range(self, chrom, strand, start, end):
-        if chrom not in self.ranges:
-            self.ranges[chrom] = {}
-
-        bin = start / 100000
-        if bin not in self.ranges[chrom]:
-            self.ranges[chrom][bin] = []
-        self.ranges[chrom][bin].insert(0, (start, end, strand))
-
-        if (end / 100000) != bin:
-            for bin in xrange(bin + 1, (end / 100000) + 1):
-                if bin not in self.ranges[chrom]:
-                    self.ranges[chrom][bin] = []
-                self.ranges[chrom][bin].insert(0, (start, end, strand))
-
-    def get_tag(self, chrom, strand, pos, ignore_strand=False):
-        '''
-        returns (region, is_reverse_orientation)
-        '''
-        if chrom not in self.ranges:
-            return None, False
-        bin = pos / 100000
-        if bin not in self.ranges[chrom]:
-            return None, False
-        for start, end, r_strand in self.ranges[chrom][bin]:
-            if pos >= start and pos <= end:
-                if ignore_strand or strand == r_strand:
-                    return self.name, False
-                return self.name, True
-        return None, False
-
-
-class RegionTagger(object):
-    def __init__(self, gtf, valid_chroms=None, only_first_fragment=True):
-        self.regions = []
-        self.counts = {}
-        self.only_first_fragment = only_first_fragment
-
-        coding = RangeMatch('coding')
-        exons = RangeMatch('other-exon')
-        utr_5 = RangeMatch('utr-5')
-        utr_3 = RangeMatch('utr-3')
-        introns = RangeMatch('intron')
-        promoters = RangeMatch('promoter')
-
-        for gene in gtf.genes:
-            if valid_chroms and gene.chrom not in valid_chroms:
-                continue
-            if gene.strand == '+':
-                promoters.add_range(gene.chrom, gene.strand, gene.start - 2000, gene.start)
-            else:
-                promoters.add_range(gene.chrom, gene.strand, gene.end, gene.end + 2000)
-
-            for transcript in gene.transcripts:
-                if transcript.has_cds:
-                    for start, end in transcript.cds:
-                        coding.add_range(gene.chrom, gene.strand, start, end)
-
-                    # TODO: Fix this so that it iterates over exons in the 5'/3' UTRS
-                    for s, e in transcript.utr_5:
-                        utr_5.add_range(gene.chrom, gene.strand, s, e)
-                    for s, e in transcript.utr_3:
-                        utr_3.add_range(gene.chrom, gene.strand, s, e)
-
-                last_end = None
-                for start, end in transcript.exons:
-                    if last_end:
-                        introns.add_range(gene.chrom, gene.strand, last_end, start)
-                    exons.add_range(gene.chrom, gene.strand, start, end)
-                    last_end = end
-
-        self.regions.append(coding)
-        self.regions.append(utr_5)
-        self.regions.append(utr_3)
-        self.regions.append(exons)
-        self.regions.append(introns)
-        self.regions.append(promoters)
-
-        self.counts['coding'] = 0
-        self.counts['coding-rev'] = 0
-        self.counts['other-exon'] = 0
-        self.counts['utr-5'] = 0
-        self.counts['utr-3'] = 0
-        self.counts['utr-5-rev'] = 0
-        self.counts['utr-3-rev'] = 0
-        self.counts['intron'] = 0
-        self.counts['promoter'] = 0
-        self.counts['other-exon-rev'] = 0
-        self.counts['intron-rev'] = 0
-        self.counts['promoter-rev'] = 0
-        self.counts['junction'] = 0
-        self.counts['intergenic'] = 0
-        self.counts['mitochondrial'] = 0
-
-    def add_read(self, read, chrom):
-        if read.is_unmapped:
-            return
-
-        if self.only_first_fragment and read.is_paired and not read.is_read1:
-            return
-
-        tag = None
-        is_rev = False
-
-        strand = '-' if read.is_reverse else '+'
-
-        if chrom == 'chrM':
-            tag = 'mitochondrial'
-
-        if not tag:
-            for op, length in read.cigar:
-                if op == 3:
-                    tag = 'junction'
-                    break
-
-        if not tag:
-            for region in self.regions:
-                tag, is_rev = region.get_tag(chrom, strand, read.pos)
-                if tag:
-                    break
-
-        if not tag:
-            tag = 'intergenic'
-
-        if tag:
-            if is_rev:
-                self.counts['%s-rev' % tag] += 1
-            else:
-                self.counts[tag] += 1
-
-        return tag
-
-    def tag_region(self, chrom, start, end, strand):
-        tag = None
-        is_rev = False
-
-        if chrom == 'chrM' or chrom == 'M':
-            tag = 'mitochondrial'
-
-        if not tag:
-            for region in self.regions:
-                tag, is_rev = region.get_tag(chrom, strand, start)
-                if is_rev:
-                    tag = '%s-rev' % tag
-
-                if start != end:
-                    endtag, is_rev = region.get_tag(chrom, strand, end)
-                    if is_rev:
-                        endtag = '%s-rev' % endtag
-
-                if tag and endtag and endtag != tag:
-                    tag = '%s/%s' % (tag, endtag)
-
-        if not tag:
-            tag = 'intergenic'
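One detail worth noting in RegionTagger above: promoter ranges are strand-aware, covering the 2 kb upstream of the transcription start (before gene.start on '+', after gene.end on '-'). A tiny sketch of that convention:

    # Strand-aware promoter window, as built in RegionTagger.__init__ above
    def promoter_range(start, end, strand, flank=2000):
        if strand == '+':
            return (start - flank, start)
        return (end, end + flank)

    print(promoter_range(10000, 15000, '+'))  # (8000, 10000)
    print(promoter_range(10000, 15000, '-'))  # (15000, 17000)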
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/stats.py
--- a/ngsutils/support/stats.py Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,166 +0,0 @@
-'''
-various statistical tests and methods...
-'''
-import math
-
-from ngsutils.support import memoize
-
-
-def median(vals):
-    '''
-    >>> median([1,2,3])
-    2
-    >>> median([1,2,3,4])
-    2.5
-    '''
-    vals.sort()
-
-    if len(vals) % 2 == 1:
-        return vals[len(vals) / 2]
-    else:
-        a = vals[(len(vals) / 2) - 1]
-        b = vals[(len(vals) / 2)]
-        return float(a + b) / 2
-
-
-def mean_stdev(l):
-    '''
-    >>> mean_stdev([1,2,2,2])
-    (1.75, 0.5)
-    >>> mean_stdev([2,2,2,2])
-    (2.0, 0.0)
-    '''
-
-    acc = 0
-    for el in l:
-        acc += el
-
-    mean = float(acc) / len(l)
-    acc = 0
-    for el in l:
-        acc += (el - mean) ** 2
-
-    if len(l) > 2:
-        stdev = math.sqrt(float(acc) / (len(l) - 1))
-    else:
-        stdev = 0.0
-
-    return (mean, stdev)
-
-
-def counts_median(d):
-    '''
-    Calculate the median from counts stored in a dictionary
-    >>> counts_median({ 1: 4, 2: 1, 3: 4 })
-    2
-    >>> counts_median({ 1: 4, 3: 4 })
-    2
-
-    '''
-    count = 0
-    for k in d:
-        count += d[k]
-
-    if count == 0:
-        return 0
-
-    acc = 0.0
-    last = 0
-    for k in sorted(d):
-        if last:
-            return (last + k) / 2
-        acc += d[k]
-        if acc / count == 0.5:
-            last = k
-        elif acc / count > 0.5:
-            return k
-
-
-def counts_mean_stdev(d):
-    '''
-
-    calc mean / stdev when data is stored as counts in a dictionary
-
-    Ex:
-        { 1: 4, 2: 1, 3: 4 } = [1, 1, 1, 1, 2, 3, 3, 3, 3]
-
-    >>> counts_mean_stdev({ 1: 4, 2: 1, 3: 4 })
-    (2.0, 1.0)
-
-    '''
-
-    acc = 0
-    count = 0
-    for k in d:
-        acc += k * d[k]
-        count += d[k]
-
-    mean = float(acc) / count
-
-    acc = 0
-    for k in d:
-        acc += (((k - mean) ** 2) * d[k])
-
-    if count > 2:
-        stdev = math.sqrt(float(acc) / (count - 1))
-    else:
-        stdev = 0.0
-
-    return (mean, stdev)
-
-
-@memoize
-def poisson_prob(x, mean):
-    '''
-        Return the probability that you could get x counts in
-        a Poisson test with a mean value.
-
-        prob(x) = sum(i=1..x){poisson(i)}
-
-        >>> poisson_prob(6,10)
-        0.1300960209527205
-        >>> poisson_prob(8,10)
-        0.33277427882095645
-    '''
-    acc = 0.0
-    for i in xrange(1, x + 1):
-        acc += poisson_func(i, mean)
-    return acc
-
-
-@memoize
-def poisson_func(mu, lambd):
-    '''
-        This is the Poisson distribution function
-
-        p(mu) = (lambda^mu * e^(-lambda)) / (mu!)
-
-        mu is a count
-        lambd is the mean
-
-        >>> poisson_func(1,10)
-        0.00045399929762484856
-        >>> poisson_func(2,10)
-        0.0022699964881242427
-        >>> poisson_func(3,10)
-        0.007566654960414142
-    '''
-    return (lambd ** mu) * (math.exp(-1 * lambd)) / _factorial(mu)
-
-
-@memoize
-def _factorial(x):
-    '''
-    >>> _factorial(1)
-    1
-    >>> _factorial(2)
-    2
-    >>> _factorial(3)
-    6
-    '''
-    return math.factorial(x)
-
-
-if __name__ == '__main__':
-    import doctest
-    doctest.testmod()
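poisson_prob above is the cumulative sum of the Poisson mass function p(mu) = lambda**mu * e**(-lambda) / mu! for mu = 1..x. A self-contained check against its doctest:

    import math

    lambd = 10
    prob = sum(lambd ** mu * math.exp(-lambd) / math.factorial(mu)
               for mu in range(1, 7))  # mu = 1..6
    print(prob)  # ~0.1300960209527205, matching poisson_prob(6, 10)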