Previous changeset: 2:7a68005de299 (2016-11-27) | Next changeset: 4:aecfe10118ed (2017-11-17)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 4a3c9f195ba5d899b1a1ce5e80281cdf230f456a
modified:
bam_filter.xml filter.py macros.xml
removed:
ngsutils/__init__.py ngsutils/__init__.pyc ngsutils/bam/__init__.py ngsutils/bam/__init__.pyc ngsutils/bed/__init__.py ngsutils/bed/__init__.pyc ngsutils/support/__init__.py ngsutils/support/__init__.pyc ngsutils/support/bgzip.py ngsutils/support/dbsnp.py ngsutils/support/dbsnp.pyc ngsutils/support/llh.py ngsutils/support/ngs_utils.py ngsutils/support/ngs_utils.pyc ngsutils/support/regions.py ngsutils/support/stats.py
diff -r 7a68005de299 -r 9b9ae5963d3c bam_filter.xml
--- a/bam_filter.xml	Sun Nov 27 15:01:21 2016 -0500
+++ b/bam_filter.xml	Mon Oct 23 13:27:02 2017 -0400
@@ -1,4 +1,4 @@
-<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@.1">
+<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@">
     <description>Removes reads from a BAM file based on criteria</description>
     <macros>
         <import>macros.xml</import>
@@ -7,67 +7,61 @@
     <expand macro="stdio" />
     <expand macro="version" />
     <command><![CDATA[
-
-        ## If the tool is executed with no filtering option,
-        ## the default parameters simply copy over the input file
-        if grep -q "\w" ${parameters};
-        then
-            $__tool_directory__/filter.py
-            $infile
-            $outfile
-            `cat ${parameters}`;
-        else
-            cp $infile $outfile;
-        fi
-
-]]>
-    </command>
+## If the tool is executed with no filtering option,
+## the default parameters simply copy over the input file
+if grep -q "\w" '${parameters}'; then
+    '$__tool_directory__/filter.py'
+    '$infile'
+    '$outfile'
+    `cat ${parameters}`;
+else
+    cp '$infile' '$outfile';
+fi
+    ]]></command>
     <configfiles>
-        <configfile name="parameters">
-<![CDATA[
-    #if $minlen:
-        -minlen $minlen
-    #end if
-    #if $maxlen
-        -maxlen $maxlen
-    #end if
-    $mapped
-    $unmapped
-    $properpair
-    $noproperpair
-    #if $mask:
-        -mask ${mask}
-    #end if
-    #if int($uniq) > -1:
-        -uniq
-        #if int($uniq) > 0:
-            $uniq
-        #end if
-    #end if
-    $uniq_start
-    #if $mismatch:
-        -mismatch $mismatch
-    #end if
-    $nosecondary
-    $noqcfail
-    $nopcrdup
-    #if $excludebed:
-        -excludebed ${excludebed} $ignore_strand
-    #end if
-    #if $includebed:
-        -includebed ${includebed} $ignore_strand
-    #end if
-    #if $includeref:
-        -includeref ${includeref}
-    #end if
-    #if $excluderef:
-        -excluderef ${excluderef}
-    #end if
-    #if $maximum_mismatch_ratio
-        -maximum_mismatch_ratio $maximum_mismatch_ratio
-    #end if
-    ]]>
-        </configfile>
+        <configfile name="parameters"><![CDATA[
+#if $minlen:
+    -minlen $minlen
+#end if
+#if $maxlen
+    -maxlen $maxlen
+#end if
+$mapped
+$unmapped
+$properpair
+$noproperpair
+#if $mask:
+    -mask ${mask}
+#end if
+#if int($uniq) > -1:
+    -uniq
+    #if int($uniq) > 0:
+        $uniq
+    #end if
+#end if
+$uniq_start
+#if $mismatch:
+    -mismatch $mismatch
+#end if
+$nosecondary
+$noqcfail
+$nopcrdup
+#if $excludebed:
+    -excludebed ${excludebed} $ignore_strand
+#end if
+#if $includebed:
+    -includebed ${includebed} $ignore_strand
+#end if
+#if $includeref:
+    -includeref ${includeref}
+#end if
+#if $excluderef:
+    -excluderef ${excluderef}
+#end if
+#if $maximum_mismatch_ratio
+    -maximum_mismatch_ratio $maximum_mismatch_ratio
+#end if
+    ]]></configfile>
     </configfiles>
     <inputs>
         <param name="infile" type="data" format="bam" label="Select BAM dataset" />
@@ -129,7 +123,6 @@
         <param argument="-maximum_mismatch_ratio" type="float" value="" optional="True" min="0.0" max="1.0"
             label="Filter by maximum mismatch ratio" help="fraction of length"/>
-
     </inputs>
     <outputs>
         <data format="bam" name="outfile" />
@@ -234,6 +227,5 @@
 | -excluderef refname           | Exclude reads mapped to a particular reference  |
 |                                | (e.g. chrM, or _dup chromosomes)                |
 +--------------------------------+-------------------------------------------------+
-]]>
-    </help>
+    ]]></help>
 </tool>
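Note: the reworked <command> block above runs filter.py only when the Cheetah-generated parameters file contains at least one word character; otherwise it copies the input BAM through unchanged. A minimal Python sketch of that control flow, for orientation only (run_wrapper and its file-name arguments are hypothetical, not part of the changeset):

import re
import shutil
import subprocess
import sys


def run_wrapper(params_path, infile, outfile, filter_py='filter.py'):
    # Read the parameters file produced by the <configfile> template.
    with open(params_path) as fh:
        params = fh.read()
    if re.search(r'\w', params):
        # Mirrors `grep -q "\w"`: at least one criterion was selected, so
        # pass the whitespace-split flags through to filter.py (the shell's
        # `cat ${parameters}` performs the same word-splitting).
        args = params.split()
        subprocess.check_call([sys.executable, filter_py, infile, outfile] + args)
    else:
        # No criteria selected: the output is a plain copy of the input.
        shutil.copy(infile, outfile)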
diff -r 7a68005de299 -r 9b9ae5963d3c filter.py
--- a/filter.py	Sun Nov 27 15:01:21 2016 -0500
+++ b/filter.py	Mon Oct 23 13:27:02 2017 -0400
@@ -107,6 +107,7 @@
 
 The tag type (:i, :f, :Z) is optional.
 """
+from __future__ import print_function
 
 import os
 import sys
@@ -118,8 +119,8 @@
 
 
 def usage():
-    print __doc__
-    print """
+    print(__doc__)
+    print("""
 Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria...
 
 Options:
@@ -131,7 +132,7 @@
 
 This will remove all unmapped reads, as well as any reads
 that have an AS:i value less than 1000.
-"""
+""")
     sys.exit(1)
 
 
@@ -760,10 +761,10 @@
             # guess at type...
             try:
                 self.value = int(value)
-            except:
+            except ValueError:
                 try:
                     self.value = float(value)
-                except:
+                except ValueError:
                     self.value = value
 
     def get_value(self, read):
@@ -960,7 +961,7 @@
             outfile = arg
         elif arg[0] == '-':
             if not arg[1:] in _criteria:
-                print "Unknown criterion: %s" % arg
+                print("Unknown criterion: %s" % arg)
                 fail = True
             if crit_args:
                 criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
@@ -968,7 +969,7 @@
         elif crit_args:
             crit_args.append(arg)
         else:
-            print "Unknown argument: %s" % arg
+            print("Unknown argument: %s" % arg)
             fail = True
 
     if not fail and crit_args:
@@ -979,11 +980,11 @@
         usage()
 
     if not infile:
-        print "Missing: input bamfile"
+        print("Missing: input bamfile")
     if not outfile:
-        print "Missing: output bamfile"
+        print("Missing: output bamfile")
     if not criteria:
-        print "Missing: filtering criteria"
+        print("Missing: filtering criteria")
         usage()
     else:
         bam_filter(infile, outfile, criteria, failed, verbose)
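The filter.py hunks are a straightforward Python 3 compatibility pass: print statements become print() calls, and the two bare except: clauses in the tag-value guessing code are narrowed to except ValueError: so that unrelated errors still propagate. A self-contained sketch of that narrowed type-guessing idiom (guess_value is a hypothetical name; in filter.py the equivalent logic sits inside a criterion class):

def guess_value(value):
    # Try int first, then float, falling back to the raw string.
    # Catching ValueError specifically (rather than a bare `except`)
    # avoids swallowing things like KeyboardInterrupt or a NameError
    # from an unrelated bug.
    try:
        return int(value)
    except ValueError:
        try:
            return float(value)
        except ValueError:
            return value


assert guess_value('1000') == 1000
assert guess_value('0.95') == 0.95
assert guess_value('chrM') == 'chrM'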
diff -r 7a68005de299 -r 9b9ae5963d3c macros.xml
--- a/macros.xml	Sun Nov 27 15:01:21 2016 -0500
+++ b/macros.xml	Mon Oct 23 13:27:02 2017 -0400
@@ -1,8 +1,8 @@
 <macros>
-    <token name="@WRAPPER_VERSION@">0.5.8</token>
+    <token name="@WRAPPER_VERSION@">0.5.9</token>
     <xml name="requirements">
        <requirements>
-            <requirement type="package" version="0.9.1.4">pysam</requirement>
+            <requirement type="package" version="@WRAPPER_VERSION@">ngsutils</requirement>
        </requirements>
    </xml>
    <xml name="version">
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/__init__.pyc |
Binary file ngsutils/__init__.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bam/__init__.py
--- a/ngsutils/bam/__init__.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,880 +0,0 @@
(entire file deleted: bundled BAM helpers, including bam_open, bam_iter, bam_pileup_iter, bam_batch_reads, the CIGAR utilities, and read_to_unmapped)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bam/__init__.pyc |
Binary file ngsutils/bam/__init__.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bed/__init__.py
--- a/ngsutils/bed/__init__.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,282 +0,0 @@
(entire file deleted: bundled BED helpers, including BedStreamer, BedFile with ~100 kb binning and optional Tabix index support, and BedRegion)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bed/__init__.pyc |
Binary file ngsutils/bed/__init__.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/__init__.py
--- a/ngsutils/support/__init__.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,254 +0,0 @@
(entire file deleted: bundled support helpers, including FASTARead, FASTA, gzip_reader, Symbolize, revcomp, Counts, memoize, and quoted_split)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/__init__.pyc |
Binary file ngsutils/support/__init__.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/bgzip.py
--- a/ngsutils/support/bgzip.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,138 +0,0 @@
(entire file deleted: the BGZip class for extracting and dumping the blocks of a bgzip/BAM archive)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/dbsnp.py
--- a/ngsutils/support/dbsnp.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,142 +0,0 @@
(entire file deleted: the SNPRecord and DBSNP classes for querying a dbSNP Tabix dump from UCSC)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/dbsnp.pyc |
Binary file ngsutils/support/dbsnp.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/llh.py
--- a/ngsutils/support/llh.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,56 +0,0 @@
(entire file deleted: the pseudo_count and calc_llh helpers for nucleotide-frequency log-likelihoods)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/ngs_utils.py
--- a/ngsutils/support/ngs_utils.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,225 +0,0 @@
(entire file deleted: general utilities, including format_number, natural_sort, dictify, gzip_aware_open/gzip_opener, filenames_to_uniq, parse_args, and a memoize decorator)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/ngs_utils.pyc |
Binary file ngsutils/support/ngs_utils.pyc has changed |
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/regions.py
--- a/ngsutils/support/regions.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,166 +0,0 @@
(entire file deleted: the RangeMatch and RegionTagger classes for tagging reads by genomic region)
diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/stats.py
--- a/ngsutils/support/stats.py	Sun Nov 27 15:01:21 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,166 +0,0 @@
(entire file deleted: statistics helpers, including median, mean_stdev, counts_median, counts_mean_stdev, poisson_prob, and poisson_func)