Repository 'ngsutils_bam_filter'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/ngsutils_bam_filter

Changeset 0:4e4e4093d65d (2015-11-11)
Next changeset 1:8187a729d9f4 (2015-12-06)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
added:
bam_filter.xml
filter.py
macros.xml
ngsutils/__init__.py
ngsutils/__init__.pyc
ngsutils/bam/__init__.py
ngsutils/bam/__init__.pyc
ngsutils/bed/__init__.py
ngsutils/bed/__init__.pyc
ngsutils/support/__init__.py
ngsutils/support/__init__.pyc
ngsutils/support/bgzip.py
ngsutils/support/dbsnp.py
ngsutils/support/dbsnp.pyc
ngsutils/support/llh.py
ngsutils/support/ngs_utils.py
ngsutils/support/ngs_utils.pyc
ngsutils/support/regions.py
ngsutils/support/stats.py
test-data/ngsutils_bam_filter_input1.bam
test-data/ngsutils_bam_filter_result1.bam
test-data/ngsutils_bam_filter_result2.bam
test-data/ngsutils_bam_filter_result3.bam
test-data/ngsutils_bam_filter_result4.bam
tool_dependencies.xml
diff -r 000000000000 -r 4e4e4093d65d bam_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bam_filter.xml Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,231 @@
+<tool id="ngsutils_bam_filter" name="BAM filter" version="@WRAPPER_VERSION@.0">
+    <description>Removes reads from a BAM file based on criteria</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="stdio" />
+    <expand macro="version" />
+    <command><![CDATA[
+    ## If the tool is executed with no filtering option,
+    ## the default parameters simply copy over the input file
+    if grep -q "\w" ${parameters};
+    then
+        $__tool_directory__/filter.py
+        $infile
+        $outfile
+        `cat ${parameters}`;
+    else
+        cp $infile $outfile;
+    fi
+]]>
+    </command>
+    <configfiles>
+        <configfile name="parameters">
+<![CDATA[
+        #if $minlen:
+            -minlen $minlen
+        #end if
+        #if $maxlen
+            -maxlen $maxlen
+        #end if
+        $mapped
+        $unmapped
+        $properpair
+        $noproperpair
+        #if $mask:
+            -mask "${mask}"
+        #end if
+        #if int($uniq) > -1:
+            -uniq
+            #if int($uniq) > 0:
+                $uniq
+            #end if
+        #end if
+        $uniq_start
+        #if $mismatch:
+            -mismatch $mismatch
+        #end if
+        $nosecondary
+        $noqcfail
+        $nopcrdup
+        #if $excludebed:
+            -excludebed "${excludebed}" $ignore_strand
+        #end if
+        #if $includebed:
+            -includebed "${includebed}" $ignore_strand
+        #end if
+        #if $includeref:
+            -includeref "${includeref}"
+        #end if
+        #if $excluderef:
+            -excluderef "${excluderef}"
+        #end if
+        #if $maximum_mismatch_ratio
+            -maximum_mismatch_ratio $maximum_mismatch_ratio
+        #end if
+    ]]>
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="infile" type="data" format="bam" label="Select BAM dataset" />
+        <param argument="-minlen" type="integer" value="" optional="True" min="0"
+            label="Remove reads that are smaller than"
+            help="in bp"/>
+        <param argument="-maxlen" type="integer" value="" optional="True" min="0"
+            label="Remove reads that are larger than"
+            help="in bp"/>
+        <param argument="-mapped" truevalue="-mapped" type="boolean" falsevalue=""
+            label="Keep only mapped reads"
+            help="" />
+        <param argument="-unmapped" truevalue="-unmapped" type="boolean" falsevalue=""
+            label="Keep only unmapped reads"
+            help="" />
+        <param argument="-properpair" truevalue="-properpair" type="boolean" falsevalue=""
+            label="Keep only properly paired reads"
+            help="both mapped, correct orientation, flag set in BAM" />
+        <param argument="-noproperpair" truevalue="-noproperpair" type="boolean" falsevalue=""
+            label="Discard properly paired reads"
+            help="" />
+        <param argument="-mask" type="text" value="" optional="True" label="Remove reads that match the mask"
+            help="e.g. 0x400, 0x2" />
+
+        <param argument="-uniq" type="integer" value="-1" optional="True" min="-1"
+            label="Remove reads that have the same sequence"
+            help="up to the Nth nucleotide starting from the 5prime end. -1 means do not filter, 0 means check along the entire read,
+                10 would filter reads that are identical over the first 10 nucleotides."/>
+
+        <param argument="-uniq_start" truevalue="-uniq_start" type="boolean" falsevalue=""
+            label="Remove reads that start at the same position"
+            help="Use only for low-coverage samples" />
+
+        <param argument="-mismatch" type="integer" value="" optional="True" min="0"
+            label="Remove reads with that many mismatches"
+            help="Indels always count as 1 regardless of length. Requires NM tag to
[...]
++--------------------------------+-------------------------------------------------+
+| -maxlen val                    | Remove reads that are larger than {val}         |
++--------------------------------+-------------------------------------------------+
+| -mapped                        | Keep only mapped reads                          |
++--------------------------------+-------------------------------------------------+
+| -unmapped                      | Keep only unmapped reads                        |
++--------------------------------+-------------------------------------------------+
+| -properpair                    | Keep only properly paired reads (both mapped,   |
+|                                | correct orientation, flag set in BAM)           |
++--------------------------------+-------------------------------------------------+
+| -noproperpair                  | Keep only not-properly paired reads             |
++--------------------------------+-------------------------------------------------+
+| -mask bitmask                  | Remove reads that match the mask (e.g. 0x400)   |
++--------------------------------+-------------------------------------------------+
+| -uniq {length}                 | Remove reads that have the same sequence        |
+|                                | Note: BAM file should be sorted                 |
+|                                | (up to an optional length)                      |
++--------------------------------+-------------------------------------------------+
+| -uniq_start                    | Remove reads that start at the same position    |
+|                                | Note: BAM file should be sorted                 |
+|                                | (Use only for low-coverage samples)             |
++--------------------------------+-------------------------------------------------+
+|-mismatch num                   | Number of mismatches or indels                  |
+|                                | indel always counts as 1 regardless of length   |
+|                                | (requires NM tag in reads)                      |
++--------------------------------+-------------------------------------------------+
+|-nosecondary                    | Remove reads that have the 0x100 flag set       |
++--------------------------------+-------------------------------------------------+
+|-noqcfail                       | Remove reads that have the 0x200 flag set       |
++--------------------------------+-------------------------------------------------+
+|-nopcrdup                       | Remove reads that have the 0x400 flag set       |
++--------------------------------+-------------------------------------------------+
+|-excludebed file.bed {nostrand} | Remove reads that are in any of the regions     |
+|                                | from the given BED file. If 'nostrand' is given,|
+|                                | strand information from the BED file is ignored.|
++--------------------------------+-------------------------------------------------+
+|-includebed file.bed {nostrand} | Remove reads that are NOT any of the regions    |
+|                                | from the given BED file. If 'nostrand' is given,|
+|                                | strand information from the BED file is ignored.|
+|                                | Note: If this is a large dataset, use           |
+|                                | "bamutils extract" instead.                     |
++--------------------------------+-------------------------------------------------+
+| -includeref refname            | Exclude reads NOT mapped to a reference         |
++--------------------------------+-------------------------------------------------+
+| -excluderef refname            | Exclude reads mapped to a particular reference  |
+|                                | (e.g. chrM, or _dup chromosomes)                |
++--------------------------------+-------------------------------------------------+
+]]>
+    </help>
+</tool>
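A note on the command block above: the Cheetah configfile renders the selected options to ${parameters}, and the shell command only runs filter.py when that file contains a word character; otherwise the input BAM is copied through untouched. A minimal Python sketch of the same dispatch, assuming local paths (run_wrapper and the file names are illustrative, not part of the tool):

    import re
    import shutil
    import subprocess

    def run_wrapper(parameters_path, infile, outfile):
        # Mirror the wrapper's `grep -q "\w" ${parameters}` test: a word
        # character in the rendered configfile means a filter was selected.
        with open(parameters_path) as fh:
            params = fh.read()
        if re.search(r'\w', params):
            subprocess.check_call(['./filter.py', infile, outfile] + params.split())
        else:
            # No filtering options selected: default to a plain copy.
            shutil.copy(infile, outfile)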
diff -r 000000000000 -r 4e4e4093d65d filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,986 @@
+#!/usr/bin/env python
+## category General
+## desc Removes reads from a BAM file based on criteria
+"""
+Removes reads from a BAM file based on criteria
+
+Given a BAM file, this script will only allow reads that meet filtering
+criteria to be written to output. The output is another BAM file with the
+reads not matching the criteria removed.
+
+Note: this does not adjust tag values reflecting any filtering. (for example:
+      if a read mapped to two locations (IH:i:2), and one was removed by
+      filtering, the IH:i tag would still read IH:i:2).
+
+Currently, the available filters are:
+    -minlen val                Remove reads that are smaller than {val}
+    -maxlen val                Remove reads that are larger than {val}
+    -mapped                    Keep only mapped reads
+    -unmapped                  Keep only unmapped reads
+    -properpair                Keep only properly paired reads (both mapped,
+                               correct orientation, flag set in BAM)
+    -noproperpair              Keep only not-properly paired reads
+
+    -mask bitmask              Remove reads that match the mask (base 10/hex)
+    -uniq {length}             Remove reads that have the same sequence
+                               Note: BAM file should be sorted
+                               (up to an optional length)
+    -uniq_start                Remove reads that start at the same position
+                               Note: BAM file should be sorted
+                               (Use only for low-coverage samples)
+
+    -mismatch num              # mismatches or indels
+                               indel always counts as 1 regardless of length
+                               (requires NM tag in reads)
+
+    -mismatch_dbsnp num dbsnp.txt.bgz
+                               # mismatches or indels - not in dbSNP.
+                               Variations are called using the MD tag.
+                               Variations that are found in the dbSNP list are
+                               not counted as mismatches. The dbSNP list is a
+                               Tabix-indexed dump of dbSNP (from UCSC Genome
+                               Browser). Indels in dbSNP are also counted.
+                               Adds a 'ZS:i' tag with the number of found SNPs
+                               in the read.
+                               (requires NM and MD tags)
+
+                               Example command for indexing:
+                               ngsutils tabixindex snp.txt.gz -s 2 -b 3 -e 4 -0
+
+    -mismatch_ref num ref.fa   # mismatches or indel - looks up mismatches
+                               directly in a reference FASTA file
+                               (use if NM tag not present)
+
+    -mismatch_ref_dbsnp num ref.fa dbsnp.txt.bgz
+                               # mismatches or indels - looks up mismatches
+                               directly from a reference FASTA file. (See
+                               -mismatch_dbsnp for dbSNP matching)
+                               (use if NM or MD tag not present)
+
+    -nosecondary               Remove reads that have the 0x100 flag set
+    -noqcfail                  Remove reads that have the 0x200 flag set
+    -nopcrdup                  Remove reads that have the 0x400 flag set
+
+
+    -exclude ref:start-end     Remove reads in this region (1-based start)
+    -excludebed file.bed {nostrand}
+                               Remove reads that are in any of the regions
+                               from the given BED file. If 'nostrand' is given,
+                               strand information from the BED file is ignored.
+
+    -include ref:start-end     Remove reads NOT in the region (can only be one)
+    -includebed file.bed {nostrand}
+                               Remove reads that are NOT any of the regions
+                               from the given BED file. If 'nostrand' is given
[...]
+    if failedfile:
+        failed_out = open(failedfile, 'w')
+    else:
+        failed_out = None
+
+    passed = 0
+    failed = 0
+
+    def _callback(read):
+        return "%s | %s kept,%s failed" % ('%s:%s' % (bamfile.getrname(read.tid), read.pos) if read.tid > -1 else 'unk', passed, failed)
+
+    for read in bam_iter(bamfile, quiet=True):
+        p = True
+
+        for criterion in criteria:
+            if not criterion.filter(bamfile, read):
+                p = False
+                failed += 1
+                if failed_out:
+                    failed_out.write('%s\t%s\n' % (read.qname, criterion))
+                #outfile.write(read_to_unmapped(read))
+                break
+        if p:
+            passed += 1
+            outfile.write(read)
+
+    bamfile.close()
+    outfile.close()
+    if failed_out:
+        failed_out.close()
+    sys.stdout.write("%s kept\n%s failed\n" % (passed, failed))
+
+    for criterion in criteria:
+        criterion.close()
+
+
+def read_to_unmapped(read):
+    '''
+    Example unmapped read
+    2358_2045_1839    |  4 | *                             |  0    |  0   | *                  | *  |  0 |  0 | GGACGAGAAGGAGTATTTCTCCGAGAACACATTCACGGAGAGTCTAACTC           | 0QT\VNO^IIRJKXTIHIKTY\STZZ[XOJKPWLHJQQQ^XPQYIIRQII           | PG:Z:bfast   | AS:i:-                     | NH:i:0   | IH:i:1   | HI:i:1                                     | CS:Z:T10213222020221330022203222011113021130222212230122             | CQ:Z:019=2+><%@.'>52%)'15?90<7;:5+(-49('-7-<>5-@5%<.2%=            | CM:i:0   | XA:i:4
+
+    Example mapped read:
+    2216_1487_198     |  16 | chr11:19915291-19925392      |  5531 |   12 | 24M2D26M           | *  |  0 |  0 | TCTGTCTGGGTGCAGTAGCTATACGTAATCCCAGCACTTTGGGAGGCCAA           | 1C8FZ`M""""""WZ""%"#\I;"`R@D]``ZZ``\QKX\Y]````IK^`           | PG:Z:bfast   | AS:i:1150   | NM:i:2   | NH:i:10   | IH:i:1   | HI:i:1   | MD:Z:24^CT26                   | CS:Z:T00103022001002113210023031213331122121223321221122             | CQ:Z:A8=%;AB<;97<3/9:>>3>@?5&18@-%;866:94=14:=?,%?:7&)1            | CM:i:9   | XA:i:4   | XE:Z:-------23322---2-11----2----------------------------
+
+    '''
+    # TODO: rewrite read properties to unmapped state
+    #       if colorspace: convert CS to NT directly
+    #       remove excess tags
+
+    read.is_unmapped = True
+    read.tid = None
+    read.pos = 0
+    read.mapq = 0
+    return read
+
+if __name__ == '__main__':
+    infile = None
+    outfile = None
+    failed = None
+    criteria = []
+
+    crit_args = []
+    last = None
+    verbose = False
+    fail = False
+
+    for arg in sys.argv[1:]:
+        if last == '-failed':
+            failed = arg
+            last = None
+        elif arg == '-h':
+            usage()
+        elif arg == '-failed':
+            last = arg
+        elif arg == '-v':
+            verbose = True
+        elif not infile and os.path.exists(arg):
+            infile = arg
+        elif not outfile:
+            outfile = arg
+        elif arg[0] == '-':
+            if not arg[1:] in _criteria:
+                print "Unknown criterion: %s" % arg
+                fail = True
+            if crit_args:
+                criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+            crit_args = [arg, ]
+        elif crit_args:
+            crit_args.append(arg)
+        else:
+            print "Unknown argument: %s" % arg
+            fail = True
+
+    if not fail and crit_args:
+        criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+
+    if fail or not infile or not outfile or not criteria:
+        if not infile and not outfile and not criteria:
+            usage()
+
+        if not infile:
+            print "Missing: input bamfile"
+        if not outfile:
+            print "Missing: output bamfile"
+        if not criteria:
+            print "Missing: filtering criteria"
+        usage()
+    else:
+        bam_filter(infile, outfile, criteria, failed, verbose)
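The elided middle of filter.py defines the criterion classes registered in _criteria; the tail above shows the contract they share: each object exposes filter(bam, read), and the first criterion to reject a read ends the check. A toy sketch of that pattern (MinLen and Mapped are illustrative stand-ins, not the module's actual class names):

    class MinLen(object):
        def __init__(self, minlen):
            self.minlen = int(minlen)

        def filter(self, bam, read):
            return len(read.seq) >= self.minlen

    class Mapped(object):
        def filter(self, bam, read):
            return not read.is_unmapped

    criteria = [MinLen(50), Mapped()]

    def passes(bam, read):
        # all() short-circuits on the first False, mirroring the `break`
        # in bam_filter()'s inner loop above.
        return all(c.filter(bam, read) for c in criteria)

Per the docstring, the equivalent command line would be: filter.py input.bam output.bam -minlen 50 -mapped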
diff -r 000000000000 -r 4e4e4093d65d macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,16 @@
+<macros>
+    <token name="@WRAPPER_VERSION@">0.5.7</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="0.7.7">pysam</requirement>
+        </requirements>
+    </xml>
+    <xml name="version">
+        <version_command>@WRAPPER_VERSION@</version_command>
+    </xml>
+    <xml name="stdio">
+        <stdio>
+            <exit_code range="1:" level="fatal" />
+        </stdio>
+    </xml>
+</macros>
diff -r 000000000000 -r 4e4e4093d65d ngsutils/__init__.pyc
Binary file ngsutils/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bam/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/bam/__init__.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,879 @@
+import sys
+import os
+import re
+import pysam
+try:
+    from eta import ETA
+except:
+    pass
+import ngsutils.support
+
+
+def bam_open(fname, mode='r', *args, **kwargs):
+    if fname.lower()[-4:] == '.bam':
+        return pysam.Samfile(fname, '%sb' % mode, *args, **kwargs)
+    return pysam.Samfile(fname, '%s' % mode, *args, **kwargs)
+
+
+def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None):
+    if not quiet and bam.filename:
+        eta = ETA(os.stat(bam.filename).st_size)
+    else:
+        eta = None
+
+    for pileup in bam.pileup(mask=mask):
+        pos = bam.tell()
+        bgz_offset = pos >> 16
+
+        if not quiet:
+            if callback:
+                eta.print_status(bgz_offset, extra=callback(pileup))
+            else:
+                eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos))
+
+        yield pileup
+
+    if eta:
+        eta.done()
+
+
+def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None):
+    '''
+    >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
+    ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
+    '''
+
+    if os.path.exists('%s.bai' % bam.filename):
+        # This is an indexed file, so it is ref sorted...
+        # Meaning that we should show chrom:pos, instead of read names
+        show_ref_pos = True
+
+    eta = None
+
+    if not ref:
+        if not quiet and bam.filename:
+            eta = ETA(os.stat(bam.filename).st_size)
+
+        for read in bam:
+            pos = bam.tell()
+            bgz_offset = pos >> 16
+
+            if not quiet and eta:
+                if callback:
+                    eta.print_status(bgz_offset, extra=callback(read))
+                elif (show_ref_pos):
+                    if read.tid > -1:
+                        eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
+                    else:
+                        eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname))
+                else:
+                    eta.print_status(bgz_offset, extra='%s' % read.qname)
+
+            yield read
+
+    else:
+        working_chrom = None
+        if ref in bam.references:
+            working_chrom = ref
+        elif ref[0:3] == 'chr':
+            # compensate for Ensembl vs UCSC ref naming
+            if ref[3:] in bam.references:
+                working_chrom = ref[3:]
+
+        if not working_chrom:
+            raise ValueError('Missing reference: %s' % ref)
+
+        tid = bam.gettid(working_chrom)
+
+        if not start:
+            start = 0
+        if not end:
+            end = bam.lengths[tid]
+
+        if not quiet and bam.filename:
+            eta = ETA(end - start)
+
+        for read in bam.fetch(working_chrom, start, end):
+            if not quiet and eta:
+                if callback:
+                    eta.print_status(read.pos - start, extra=callback(read))
+                else:
+                    eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
+
+            yield read
+
+    if eta:
+        eta.done()
+
+
+def bam_batch_reads(bam, quiet=False):
+    '''
+    Batch mapping for the same reads (qname) together, this way
+    they can all be compared/converted together.
+    '''
+    reads = []
+    last = None
+    for read in bam_iter(bam, quiet=quiet):
+        if last and read.qname != last:
+            yield reads
+            reads = []
+        last = read.qname
+        reads.append(read)
+
+    if reads:
+        yield reads
+
+
+bam_cigar = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']
+bam_cigar_op = {
+    'M': 0,
+    'I': 1,
+    'D': 2,
+    'N': 3,
+    'S': 4,
+    'H': 5,
+    'P': 6,
+    '=': 7,
+    'X': 8,
+}
+
+
+def cigar_fromstr(s):
+    '''
+    >>> cigar_fromstr('10M5I20M')
+    [(0, 10), (1, 5), (0,
[...]
+def read_alignment_fragments_gen(read):
+    '''
+    Takes a read and returns the start and end positions for each match/mismatch region.
+    This will let us know where each read alignment "touches" the genome.
+    '''
+
+    for start, end in _read_alignment_fragments_gen(read.pos, read.cigar):
+        yield (start, end)
+
+
+def _read_alignment_fragments_gen(pos, cigar):
+    ''' Test-able version of read_alignment_fragments_gen
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('50M')))
+    [(1, 51)]
+
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('25M100N25M')))
+    [(1, 26), (126, 151)]
+
+    >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('20M1D4M100N10M5I10M')))
+    [(1, 21), (22, 26), (126, 136), (136, 146)]
+    '''
+    ref_pos = pos
+
+    for op, length in cigar:
+        if op == 0:
+            yield (ref_pos, ref_pos + length)
+            ref_pos += length
+        elif op == 1:
+            pass
+        elif op == 2:
+            ref_pos += length
+        elif op == 3:
+            ref_pos += length
+        else:
+            raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+
+def read_cigar_at_pos(cigar, qpos, is_del):
+    '''
+    Returns the CIGAR operation for a given read position
+
+    qpos is the 0-based index of the base in the read
+    '''
+    pos = 0
+    returnnext = False
+    for op, length in cigar:
+        if returnnext:
+            return op
+        if op == 0:
+            pos += length
+        elif op == 1:
+            pos += length
+        elif op == 2:
+            pass
+        elif op == 3:
+            pass
+        elif op == 4:
+            pos += length
+        elif op == 5:
+            pass
+        else:
+            raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+        if pos > qpos:
+            return op
+        if is_del and pos == qpos:
+            returnnext = True
+
+    return None
+
+
+def cleancigar(cigar):
+    '''
+    Cleans a CIGAR alignment to remove zero length operations
+
+    >>> cigar_tostr(cleancigar(cigar_fromstr('31M0N52M18S')))
+    '83M18S'
+
+    '''
+    newcigar = []
+
+    changed = False
+    zero = False
+
+    for op, size in cigar:
+        if size > 0:
+            if zero and newcigar:
+                if newcigar[-1][0] == op:
+                    newsize = newcigar[-1][1] + size
+                    newcigar[-1] = (op, newsize)
+                zero = 0
+            else:
+                newcigar.append((op, size))
+        else:
+            changed = True
+            zero = True
+
+    if changed:
+        return newcigar
+
+    return None
+
+
+def read_cleancigar(read):
+    '''
+    Replaces the CIGAR string for a read to remove any operations that are zero length.
+
+    This happens to the read in-place
+    '''
+    if read.is_unmapped:
+        return False
+
+    newcigar = []
+
+    newcigar = cleancigar(read.cigar)
+
+    if newcigar:
+        read.cigar = newcigar
+        return True
+
+    return False
+
+
+def read_to_unmapped(read, ref=None):
+    '''
+    Converts a read from mapped to unmapped.
+
+    Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is passed)
+    '''
+
+    newread = pysam.AlignedRead()
+
+    if ref:
+        tags = [('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar)))]
+
+    newread.is_unmapped = True
+    newread.mapq = 0
+    newread.tlen = 0
+    newread.pos = -1
+    newread.pnext = -1
+    newread.rnext = -1
+    newread.tid = -1
+
+    newread.qname = read.qname
+
+    if read.is_paired:
+        newread.is_paired = True
+
+    if not read.is_unmapped and read.is_reverse:
+        newread.seq = ngsutils.support.revcomp(read.seq)
+        newread.qual = read.qual[::-1]
+    else:
+        newread.seq = read.seq
+        newread.qual = read.qual
+
+    newread.tags = tags
+
+    return newread
+
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
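For orientation, a short usage sketch of the iterators above, written in the module's own Python 2 style ('input.bam' is a hypothetical BAM whose reads are grouped by name):

    bam = bam_open('input.bam')  # picks binary vs. text mode from the extension
    for reads in bam_batch_reads(bam, quiet=True):
        # all mappings that share one qname arrive together
        print reads[0].qname, len(reads)
    bam.close()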
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bam/__init__.pyc
Binary file ngsutils/bam/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bed/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/bed/__init__.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,282 @@
+import os
+import ngsutils.support.ngs_utils
+import pysam
+
+
+class BedStreamer(object):
+    '''
+    Streams BedRegions from a BED file
+
+    Note - this can only be used once! There is no mechanism to seek the stream.
+    '''
+
+    def __init__(self, fname=None, fileobj=None, quiet=False):
+        if not fname and not fileobj:
+            raise ValueError("You must specify either fname or fileobj!")
+
+        self.reader = ngsutils.support.gzip_reader(fname=fname, quiet=quiet, fileobj=fileobj)
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        try:
+            while True:
+                line = self.reader.next().strip()
+                if line and line[0] != '#':
+                    cols = line.split('\t')
+                    while len(cols) < 6:
+                        cols.append('')
+
+                    return BedRegion(*cols)
+        except:
+            raise StopIteration
+
+
+
+class BedFile(object):
+    '''
+    BED files are read in their entirety into memory, in a series of bins. Each bin
+    is ~100kb in size. Each bin can then be iterated over.
+
+    This is less efficient than using a proper index, but in reality, this
+    usually isn't an issue. However, if the BED file has been Tabix indexed,
+    that index will be used for random access.
+
+    NOTE: This isn't very efficient, so perhaps this can be remodeled into a BedFile
+    and a BedFileIndex where the file is indexed only if random access is requested.
+    '''
+
+    _bin_const = 100000
+
+    def __init__(self, fname=None, fileobj=None, region=None):
+        self._bins = {}
+        self._bin_list = []
+        self._cur_bin_idx = 0
+        self._cur_bin_pos = 0
+        self._tellpos = 0
+        self._total = 0
+        self._length = 0
+        self.__tabix = None
+
+        self.filename = fname
+
+        if os.path.exists('%s.tbi' % fname):
+            self.__tabix = pysam.Tabixfile(fname)
+
+        if fileobj:
+            self.__readfile(fileobj)
+        elif fname:
+            with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj:
+                self.__readfile(fobj)
+        elif region:
+            chrom, startend = region.split(':')
+            if '-' in startend:
+                start, end = [int(x) for x in startend.split('-')]
+            else:
+                start = int(startend)
+                end = start
+            start -= 1
+
+            self.__add_region(BedRegion(chrom, start, end))
+        else:
+            raise ValueError("Must specify either filename, fileobj, or region")
+
+    def __readfile(self, fobj):
+        for line in fobj:
+            line = line.strip()
+            if line and line[0] != '#':
+                cols = line.split('\t')
+                while len(cols) < 6:
+                    cols.append('')
+
+                region = BedRegion(*cols)
+                self.__add_region(region)
+
+        self._bin_list.sort()
+        for bin in self._bins:
+            self._bins[bin].sort()
+
+    def __add_region(self, region):
+        self._total += region.end - region.start
+        self._length += 1
+
+        startbin = region.start / BedFile._bin_const
+        endbin = region.end / BedFile._bin_const
+
+        for bin in xrange(startbin, endbin + 1):
+            if not (region.chrom, bin) in self._bins:
+                self._bin_list.append((region.chrom, bin))
+                self._bins[(region.chrom, bin)] = []
+            self._bins[(region.chrom, bin)].append(region)
+
+    def fetch(self, chrom, start, end, strand=None):
+        '''
+        For TABIX indexed BED files, find all regions w/in a range
+
+        For non-TABIX index BED files, use the calculated bins, and
+        output matching regions
+        '''
+
+        if self.__tabix:
+            for match in self.__tabix.fetch(chrom, start, end):
+                region = BedRegion(*match.split('\t'))
+                if not strand or (strand
[...]
+    def close(self):
+        pass
+
+    @property
+    def length(self):
+        return self._length
+
+    @property
+    def total(self):
+        return self._total
+
+    def __iter__(self):
+        self._cur_bin_idx = 0
+        self._cur_bin_pos = 0
+        self._tellpos = 0
+        return self
+
+    def next(self):
+        if self._cur_bin_idx >= len(self._bin_list):
+            raise StopIteration
+
+        binvals = self._bins[self._bin_list[self._cur_bin_idx]]
+        while self._cur_bin_pos < len(binvals):
+            val = binvals[self._cur_bin_pos]
+            self._cur_bin_pos += 1
+
+            startbin = (val.chrom, val.start / BedFile._bin_const)
+            if startbin == self._bin_list[self._cur_bin_idx]:
+                self._tellpos += 1
+                return val
+
+        self._cur_bin_idx += 1
+        self._cur_bin_pos = 0
+        return self.next()
+
+
+class BedRegion(object):
+    def __init__(self, chrom, start, end, name='', score='', strand='', thickStart='', thickEnd='', rgb='', *args):
+        self.chrom = chrom
+        self.start = int(start)
+        self.end = int(end)
+        self.name = name
+
+        if score == '':
+            self.score = 0
+        else:
+            self.score = float(score)
+
+        if strand == '':
+            self.strand = None
+        else:
+            self.strand = strand
+
+        if thickStart == '':
+            self.thickStart = None
+        else:
+            self.thickStart = thickStart
+
+        if thickEnd == '':
+            self.thickEnd = None
+        else:
+            self.thickEnd = thickEnd
+
+        if rgb == '':
+            self.rgb = None
+        else:
+            self.rgb = rgb
+
+        self.extras = args
+
+    def clone(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, rgb=None, *args):
+        cols = []
+        cols.append(self.chrom if chrom is None else chrom)
+        cols.append(self.start if start is None else start)
+        cols.append(self.end if end is None else end)
+        cols.append(self.name if name is None else name)
+        cols.append(self.score if score is None else score)
+        cols.append(self.strand if strand is None else strand)
+        cols.append(self.thickStart if thickStart is None else thickStart)
+        cols.append(self.thickEnd if thickEnd is None else thickEnd)
+        cols.append(self.rgb if rgb is None else rgb)
+
+        for i, val in enumerate(self.extras):
+            if len(args) > i:
+                cols.append(args[i])
+            else:
+                cols.append(val)
+
+        return BedRegion(*cols)
+
+    @property
+    def score_int(self):
+        score = str(self.score)
+        if score[-2:] == '.0':
+            score = score[:-2]
+
+        return score
+
+    def __key(self):
+        return (self.chrom, self.start, self.end, self.strand, self.name)
+
+    def __lt__(self, other):
+        return self.__key() < other.__key()
+
+    def __gt__(self, other):
+        return self.__key() > other.__key()
+
+    def __eq__(self, other):
+        return self.__key() == other.__key()
+
+    def write(self, out):
+        out.write('%s\n' % self)
+
+    def __repr__(self):
+        outcols = []
+
+        if self.rgb:
+            outcols.append(self.rgb)
+        if self.thickEnd or outcols:
+            outcols.append(self.thickEnd if self.thickEnd else self.end)
+        if self.thickStart or outcols:
+            outcols.append(self.thickStart if self.thickStart else self.start)
+        if self.strand or outcols:
+            outcols.append(self.strand)
+        if self.score_int != '' or outcols:
+            outcols.append(self.score_int)
+        if self.name or outcols:
+            outcols.append(self.name)
+
+        outcols.append(self.end)
+        outcols.append(self.start)
+        outcols.append(self.chrom)
+
+        return '\t'.join([str(x) for x in outcols[::-1]])
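BedFile trades memory for simplicity: each region is registered in every ~100kb bin its span touches, so fetch() only has to scan candidate bins (or it defers to a .tbi index when one exists next to the file). A usage sketch, assuming a hypothetical 'regions.bed':

    bed = BedFile('regions.bed')
    for region in bed.fetch('chr1', 150000, 160000):
        print region.chrom, region.start, region.end, region.strand
    print bed.length, bed.total  # number of regions, total bases covered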
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bed/__init__.pyc
Binary file ngsutils/bed/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/__init__.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,249 @@
+import collections
+import gzip
+import os
+import sys
+import re
+try:
+    from eta import ETA
+except:
+    pass
+
+class FASTARead(collections.namedtuple('FASTARecord', 'name comment seq')):
+    def __repr__(self):
+        if self.comment:
+            return '>%s %s\n%s\n' % (self.name, self.comment, self.seq)
+        return '>%s\n%s\n' % (self.name, self.seq)
+
+    def subseq(self, start, end, comment=None):
+        if self.comment:
+            comment = '%s %s' % (self.comment, comment)
+
+        return FASTARead(self.name, comment, self.seq[start:end])
+
+    def clone(self, name=None, comment=None, seq=None):
+        n = name if name else self.name
+        c = comment if comment else self.comment
+        s = seq if seq else self.seq
+
+        return FASTARead(n, c, s)
+
+    def write(self, out):
+        out.write(repr(self))
+
+
+class FASTA(object):
+    def __init__(self, fname=None, fileobj=None, qual=False):
+        self.fname = fname
+        self.qual = qual
+        if fileobj:
+            self.fileobj = fileobj
+        else:
+            if self.fname == '-':
+                self.fileobj = sys.stdin
+            elif self.fname[-3:] == '.gz' or self.fname[-4:] == '.bgz':
+                self.fileobj = gzip.open(os.path.expanduser(self.fname))
+            else:
+                self.fileobj = open(os.path.expanduser(self.fname))
+
+        if not self.fileobj:
+            raise ValueError("Missing valid filename or fileobj")
+
+    def close(self):
+        if self.fileobj != sys.stdout:
+            self.fileobj.close()
+
+    def tell(self):
+        # always relative to uncompressed...
+        return self.fileobj.tell()
+
+    def seek(self, pos, whence=0):
+        self.fileobj.seek(pos, whence)
+
+    def fetch(self, quiet=False):
+        name = ''
+        comment = ''
+        seq = ''
+
+        if not quiet and self.fname and self.fname != '-':
+            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
+        else:
+            eta = None
+
+        for line in self.fileobj:
+            line = line.strip()
+            if not line:
+                continue
+            if line[0] == '#':
+                continue
+
+            if line[0] == '>':
+                if name and seq:
+                    if eta:
+                        eta.print_status(extra=name)
+                    yield FASTARead(name, comment, seq)
+
+                spl = re.split(r'[ \t]', line[1:], maxsplit=1)
+                name = spl[0]
+                if len(spl) > 1:
+                    comment = spl[1]
+                else:
+                    comment = ''
+                seq = ''
+
+            else:
+                if self.qual:
+                    seq = seq + ' ' + line
+                else:
+                    seq += line
+
+        if name and seq:
+            if eta:
+                eta.print_status(extra=name)
+            yield FASTARead(name, comment, seq)
+
+        if eta:
+            eta.done()
+
+
+def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None):
+    if fileobj:
+        f = fileobj
+    elif fname == '-':
+        f = sys.stdin
+    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+        f = gzip.open(os.path.expanduser(fname))
+    else:
+        f = open(os.path.expanduser(fname))
+
+    if quiet or fname == '-':
+        eta = None
+    else:
+        eta = ETA(os.stat(fname).st_size, fileobj=f)
+
+    for line in f:
+        if eta:
+            if callback:
+                extra = callback()
+            else:
+                extra = ''
+
+            eta.print_status(extra=extra)
+        yield line
+
+        if done_callback and done_callback():
+                break
+
+    if f != sys.stdin:
+        f.close()
+
+    if eta:
+        eta.done()
+
+
+class Symbolize(object):
+    'Converts strings to symbols - basically a cache of strings'
+    def __init__(self):
+        self.__cache = {}
+
+    def __getitem__(self, k):
+        if not k in self.__cache:
+            self.__cache[k] = k
+
+        return self.__cache[k]
+
+symbols = Symbolize()
+
+_compliments = {
+'a': 't',
+'A': 'T',
+'c': 'g',
+'C': 'G',
+'g': 'c',
+'G': 'C',
+'t': 'a',
+'T': 'A',
+'n': 'n',
+'N': 'N'
+}
+
+
+def revcomp(seq):
+    '''
+    >>> revcomp('ATCGatcg')
+    'cgatCGAT'
+    '''
+    ret = []
+
+    for s in seq:
+        ret.append(_compliments[s])
+
+    ret.reverse()
+    return ''.join(ret)
+
+
+class Counts(object):
+    '''
+    Setup simple binning.  Bins are continuous 0->max.  Values are added to
+    bins and then means / distributions can be calculated.
+    '''
+    def __init__(self):
+        self.bins = []
+
+    def add(self, val):
+        while len(self.bins) <= val:
+            self.bins.append(0)
+        self.bins[val] += 1
+
+    def mean(self):
+        acc = 0
+        count = 0
+
+        for i, val in enumerate(self.bins):
+            acc += (i * val)
+            count += val
+
+        if count > 0:
+            return float(acc) / count
+
+    def max(self):
+        return len(self.bins) - 1
+
+
+def memoize(func):
+    if 'TESTING' in os.environ or 'DEBUG' in os.environ:
+        return func
+
+    __cache = {}
+    def inner(*args, **kwargs):
+        k = (args, tuple(kwargs.iteritems()))
+        if  k not in __cache:
+            __cache[k] = func(*args, **kwargs)
+        return __cache[k]
+
+    inner.__doc__ = '(@memoized %s)\n%s' % (func.__name__, func.__doc__)
+    return inner
+
+
+def quoted_split(s, delim, quote_char='"'):
+    tokens = []
+
+    buf = ""
+    inquote = False
+
+    for c in s:
+        if inquote:
+            buf += c
+            if c == quote_char:
+                inquote = False
+        elif c == delim:
+            tokens.append(buf)
+            buf = ""
+        else:
+            buf += c
+            if c == quote_char:
+                inquote = True
+
+    if buf:
+        tokens.append(buf)
+
+    return tokens
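Two helpers above are easy to misread, so here are quick checks (outputs traced by hand from the code, not taken from the module's own doctests):

    # quoted_split() keeps delimiters that fall inside quotes and retains
    # the quote characters themselves:
    print quoted_split('a,"b,c",d', ',')  # ['a', '"b,c"', 'd']

    # Counts bins non-negative integer samples; max() is the largest
    # value ever added:
    c = Counts()
    for val in [2, 2, 3, 5]:
        c.add(val)
    print c.mean(), c.max()  # 3.0 5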
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/__init__.pyc
Binary file ngsutils/support/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/bgzip.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/bgzip.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+'''
+Extract the blocks from a BGZip file.
+
+BAM files are stored as blocks in a bgzip archive. This class
+will load the bgzip archive and output the block information.
+'''
+
+import sys
+import os
+import struct
+
+
+class BGZip(object):
+    def __init__(self, fname):
+        self.fname = fname
+        self.pos = 0
+        self.fileobj = open(self.fname)
+        self.fsize = os.stat(self.fname).st_size
+        self.cur_chunk = 0
+
+        self.cpos = 0
+        self.cdata = 0
+
+    def close(self):
+        self.fileobj.close()
+
+    def seek(self, offset):
+        bgz_offset = offset >> 16
+        chunk_offset = offset & 0xFFFF
+
+        self.fileobj.seek(bgz_offset)
+        self.read_chunk()
+        self.chunk_pos = chunk_offset
+
+    def read(self, amount, whence=0):
+        if whence not in [0, 1]:
+            print "Bad Whence value!: %s" % whence
+            return
+
+        if whence == 0:
+            self.seek(0, 0)
+
+        ### read into chunk, if not enough data in chunk, read next chunk
+        ret = ''
+        while amount and self.pos < self.fsize:
+            if len(self.cdata) - self.cpos < amount:
+                ret += self.cdata[self.cpos:self.cpos + amount]
+                self.cpos += amount
+                return ret
+            else:
+                ret += self.cdata[self.cpos:]
+                amount = amount - len(ret)
+                self.read_chunk()
+        return ret
+
+    def read_chunk(self):
+        self.fileobj.seek(10)
+        id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
+        subpos = 0
+        bsize = 0
+
+        while subpos < xlen:
+            si1, si2, slen = self._read_fields('<BBH')
+            if si1 == 66 and si2 == 67:
+                bsize, = self._read_fields('<H')
+            else:
+                self.fileobj.seek(slen, 1)
+                self.pos += slen
+
+            subpos += 6 + slen
+
+        cdata_size = bsize - xlen - 19
+        self.cdata = self.fileobj.read(cdata_size)  # inflate value
+        self.fileobj.seek(8)
+
+        self.cur_chunk += 1
+        self.cpos = 0
+
+    def dump(self):
+        self.fileobj.seek(0)
+        block_num = 0
+
+        while self.pos < self.fsize:
+            print "[BLOCK %s]" % block_num
+            # read header
+            id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
+            print 'id1: %s' % id1
+            print 'id2: %s' % id2
+            print 'cm: %s' % cm
+            print 'flg: %s' % flg
+            print 'mtime: %s' % mtime
+            print 'xfl: %s' % xfl
+            print 'os: %s' % os
+            print 'xlen: %s' % xlen
+
+            # read subfields
+            subpos = 0
+            bsize = 0
+
+            while subpos < xlen:
+                si1, si2, slen = self._read_fields('<BBH')
+                print '    si1: %s' % si1
+                print '    si2: %s' % si2
+                print '    slen: %s' % slen
+                print '    data: [%s]' % slen
+
+                if si1 == 66 and si2 == 67:
+                    bsize, = self._read_fields('<H')
+                else:
+                    self.fileobj.seek(slen, 1)
+                    self.pos += slen
+
+                subpos += 6 + slen
+
+            cdata_size = bsize - xlen - 19
+
+            print 'bsize: %s' % bsize
+            print 'cdata: [%s]' % (cdata_size)
+
+            self.fileobj.seek(cdata_size, 1)
+            self.pos += cdata_size
+            crc, isize = self._read_fields('<II')
+
+            print "crc: %s" % crc
+            print "isize: %s" % isize
+            # print "POS: %s" % self.pos
+
+            block_num += 1
+
+    def _read_fields(self, field_types):
+        size = struct.calcsize(field_types)
+        self.pos += size
+        return struct.unpack(field_types, self.fileobj.read(size))
+
+if __name__ == '__main__':
+    print BGZip(sys.argv[1]).dump()
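The seek() method above unpacks a BGZF "virtual offset", the same convention the bam/__init__.py iterators rely on when they shift bam.tell() right by 16 bits. In short (some_bam is assumed to be an open pysam BAM handle):

    virtual = some_bam.tell()
    file_offset = virtual >> 16      # where the compressed block starts on disk
    within_block = virtual & 0xFFFF  # byte offset inside the inflated block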
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/dbsnp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/dbsnp.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,141 @@
+'''
+Support package for processing a dbSNP tabix dump from UCSC.
+'''
+
+import pysam
+import collections
+import sys
+from ngsutils.support import revcomp
+
+
+class SNPRecord(collections.namedtuple('SNPRecord', '''bin
+chrom
+chromStart
+chromEnd
+name
+score
+strand
+refNCBI
+refUCSC
+observed
+molType
+clazz
+valid
+avHet
+avHetSE
+func
+locType
+weight
+exceptions
+submitterCount
+submitters
+alleleFreqCount
+alleles
+alleleNs
+alleleFreqs
+bitfields
+''')):
+    __slots__ = ()
+
+    @property
+    def alleles(self):
+        alts = []
+        for alt in self.observed.split('/'):
+            if alt != '-' and self.strand == '-':
+                alt = revcomp(alt)
+
+            alts.append(alt)
+
+        return alts
+
+    @property
+    def snp_length(self):
+        return self.chromEnd - self.chromStart
+
+
+def autotype(ar, length=26):
+    out = []
+    for el in ar:
+        val = None
+        try:
+            val = int(el)
+        except:
+            try:
+                val = float(el)
+            except:
+                val = el
+
+        out.append(val)
+
+    if len(out) < 12:
+        raise TypeError("Invalid dbSNP file! We need at least 12 columns to work with.")
+
+    while len(out) < length:
+        out.append(None)
+    return out
+
+
+class DBSNP(object):
+    def __init__(self, fname):
+        self.dbsnp = pysam.Tabixfile(fname)
+        self.asTup = pysam.asTuple()
+
+    def fetch(self, chrom, pos):
+        'Note: pos is 0-based'
+
+        # Note: tabix the command uses 1-based positions, but
+        #       pysam.Tabixfile uses 0-based positions
+
+        for tup in self.dbsnp.fetch(chrom, pos, pos + 1, parser=self.asTup):
+            snp = SNPRecord._make(autotype(tup))
+            if snp.chromStart == pos:
+                yield snp
+
+    def close(self):
+        self.dbsnp.close()
+
+    def dump(self, chrom, op, pos, base, snp, exit=True):
+        print
+        print ' ->', op, chrom, pos, base
+        print snp
+        print snp.alleles
+        if exit:
+            sys.exit(1)
+
+    def is_valid_variation(self, chrom, op, pos, seq, verbose=False):
+        for snp in self.fetch(chrom, pos):
+            if not '/' in snp.observed or snp.clazz not in ['single', 'mixed', 'in-del', 'insertion', 'deletion']:
+                # these are odd variations that we can't deal with... (microsatellites, tooLongToDisplay members, etc)
+                continue
+
+            if op == 0:
+                if snp.clazz in ['single', 'mixed'] and seq in snp.alleles:
+                    return True
+                # elif verbose:
+                #     for alt in snp.alleles:
+                #         if len(alt) > 1:
+                #             self.dump(chrom, op, pos, seq, snp, False)
+
+            elif op == 1:
+                if snp.clazz in ['insertion', 'mixed', 'in-del']:
+                    if seq in snp.alleles:
+                        if len(seq) > 1 and verbose:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        return True
+
+                    if verbose:
+                        if len(seq) > 1:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        else:
+                            for alt in snp.alleles:
+                                if len(alt) > 1:
+                                    self.dump(chrom, op, pos, seq, snp, False)
+
+            elif op == 2:
+                if snp.clazz in ['deletion', 'mixed', 'in-del']:
+                    if '-' in snp.alleles and seq in snp.alleles:
+                        if len(seq) > 1 and verbose:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        return True
+
+        return False
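A usage sketch for the class above; 'dbsnp.txt.bgz' stands in for a Tabix-indexed UCSC dbSNP dump, and the position/allele are made up for illustration. The op codes follow CIGAR: 0 match/mismatch, 1 insertion, 2 deletion:

    db = DBSNP('dbsnp.txt.bgz')
    # pos is 0-based, matching DBSNP.fetch()
    if db.is_valid_variation('chr1', 0, 10176, 'C'):
        print 'known SNP - not counted as a mismatch by -mismatch_dbsnp'
    db.close()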
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/dbsnp.pyc
Binary file ngsutils/support/dbsnp.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/llh.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/llh.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,55 @@
+'''
+Methods for calculating log-likelihoods for nucleotide frequencies
+'''
+import math
+import collections
+from ngsutils.support import memoize
+
+_default_background = {'A': 0.3, 'T': 0.3, 'C': 0.2, 'G': 0.2}
+
+NucleotideLogLikelihood = collections.namedtuple('NucleotideLogLikelihood', 'A C G T pseudo')
+
+@memoize
+def pseudo_count(N, bg):
+    '''
+    >>> pseudo_count(100, _default_background['A'])
+    3
+    >>> pseudo_count(100, _default_background['C'])
+    2
+    '''
+
+    return bg * math.sqrt(N)
+
+
+def calc_llh(A, C, G, T, bg=_default_background, pseudo='auto'):
+    if pseudo == 'auto':
+        N = A + C + G + T
+        Ap = float(A) + pseudo_count(N, bg['A'])
+        Cp = float(C) + pseudo_count(N, bg['C'])
+        Gp = float(G) + pseudo_count(N, bg['G'])
+        Tp = float(T) + pseudo_count(N, bg['T'])
+    elif pseudo:
+        Ap = float(A) + pseudo
+        Cp = float(C) + pseudo
+        Gp = float(G) + pseudo
+        Tp = float(T) + pseudo
+    else:
+        Ap = float(A)
+        Cp = float(C)
+        Gp = float(G)
+        Tp = float(T)
+
+    Np = Ap + Cp + Gp + Tp
+
+    freqA = Ap / Np
+    freqC = Cp / Np
+    freqG = Gp / Np
+    freqT = Tp / Np
+
+    return NucleotideLogLikelihood(math.log(freqA / bg['A']), math.log(freqC / bg['C']), math.log(freqG / bg['G']), math.log(freqT / bg['T']), pseudo)
+
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
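A quick sanity check for calc_llh(): with pseudo='auto', each count is smoothed by bg * sqrt(N) before the log(frequency / background) ratios are taken, so enrichment relative to background yields a positive score:

    llh = calc_llh(70, 10, 10, 10)
    print llh.A > 0  # True: A is enriched vs. its 0.3 background
    print llh.C < 0  # True: C is depleted vs. its 0.2 background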
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/ngs_utils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/ngs_utils.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+"""
+
+Common util classes / functions for the NGS project
+
+"""
+import sys
+import os
+import gzip
+import re
+import collections
+
+
+def format_number(n):
+    '''
+    >>> format_number(1000)
+    '1,000'
+    >>> format_number(1234567)
+    '1,234,567'
+    '''
+    ar = list(str(n))
+    for i in range(len(ar))[::-3][1:]:
+        ar.insert(i + 1, ',')
+    return ''.join(ar)
+
+
+def natural_sort(ar):
+    '''
+    >>> natural_sort('1 3 4 2 5'.split())
+    ['1', '2', '3', '4', '5']
+    >>> natural_sort('1 10 20 2 3 4'.split())
+    ['1', '2', '3', '4', '10', '20']
+    '''
+    to_sort = []
+    for item in ar:
+        spl = re.split('(\d+)', item)
+        l2 = []
+        for el in spl:
+            try:
+                n = int(el)
+            except:
+                n = el
+            l2.append(n)
+        to_sort.append((l2, item))
+
+    to_sort.sort()
+    return [x[1] for x in to_sort]
+
+
+def dictify(values, colnames):
+    """
+    Convert a list of values into a dictionary based upon given column names.
+
+    If the column name starts with an '@', the value is assumed to be a comma
+    separated list.
+
+    If the name starts with a '#', the value is assumed to be an int.
+
+    If the name starts with '@#', the value is assumed to be a comma separated
+    list of ints.
+
+    """
+    d = {}
+    for i in xrange(len(colnames)):
+        key = colnames[i]
+        split = False
+        num = False
+
+        if key[0] == '@':
+            key = key[1:]
+            split = True
+        if key[0] == '#':
+            key = key[1:]
+            num = True
+
+        if i < len(values):
+            if num and split:
+                val = [int(x) for x in values[i].rstrip(',').split(',')]
+            elif num:
+                val = int(values[i])
+            elif split:
+                val = values[i].rstrip(',').split(',')
+            else:
+                val = values[i]
+
+            d[key] = val
+
+        else:
+            d[key] = None
+
+    return d
+
+
+def gzip_aware_open(fname):
+    if fname == '-':
+        f = sys.stdin
+    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+        f = gzip.open(os.path.expanduser(fname))
+    else:
+        f = open(os.path.expanduser(fname))
+    return f
+
+
+class gzip_opener:
+    '''
+    A Python 2.6 class to handle 'with' opening of text files that may
+    or may not be gzip compressed.
+    '''
+    def __init__(self, fname):
+        self.fname = fname
+
+    def __enter__(self):
+        self.f = gzip_aware_open(self.fname)
+        return self.f
+
+    def __exit__(self, type, value, traceback):
+        if self.f != sys.stdin:
+            self.f.close()
+        return False
+
+
+def filenames_to_uniq(names, new_delim='.'):
+    '''
+    Given a set of file names, produce a list of names consisting of the
+    uniq parts of the names. This works from the end of the name.  Chunks of
+    the name are split on '.' and '-'.
+
+    For example:
+        A.foo.bar.txt
+        B.foo.bar.txt
+        returns: ['A','B']
+
+        AA.BB.foo.txt
+        CC.foo.txt
+        returns: ['AA.BB','CC']
+
+    >>> filenames_to_uniq('a.foo.bar.txt b.foo.bar.txt'.split())
+    ['a', 'b']
+    >>> filenames_to_uniq('a.b.foo.txt c.foo.txt'.split())
+    ['a.b', 'c']
+
+    '''
+    name_words = []
+    maxlen = 0
+    for name in names:
+        name_words.append(name.replace('.', ' ').replace('-', ' ').strip().split())
+        name_words[-1].reverse()
+        if len(name_words[-1]) > maxlen:
+            maxlen = len(name_words[-1])
+
+    common = [False, ] * maxlen
+    for i in xrange(maxlen):
+        last = None
+        same = True
+        for nameword in name_words:
+            if i >= len(nameword):
+                same = False
+                break
+            if not last:
+                last = nameword[i]
+            elif nameword[i] != last:
+                same = False
+                break
+        common[i] = same
+
+    newnames = []
+    for nameword in name_words:
+        nn = []
+        for (i, val) in enumerate(common):
+            if not val and i < len(nameword):
+                nn.append(nameword[i])
+        nn.reverse()
+        newnames.append(new_delim.join(nn))
+    return newnames
+
+
+def parse_args(argv, defaults=None, expected_argc=0):
+    opts = {}
+    if defaults:
+        opts.update(defaults)
+
+    args = []
+
+    i = 0
+    while i < len(argv):
+        if argv[i][0] == '-':
+            arg = argv[i].lstrip('-')
+            if '=' in arg:
+                k, v = arg.split('=', 2)
+                if k in defaults:
+                    if type(defaults[k]) == float:
+                        opts[k] = float(v)
+                    elif type(defaults[k]) == int:
+                        opts[k] = int(v)
+                    else:
+                        opts[k] = v
+            else:
+                opts[arg] = True
+        else:
+            args.append(argv[i])
+        i += 1
+
+    while len(args) < expected_argc:
+        args.append(None)
+    return opts, args
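+
+# Usage sketch (argument values are illustrative, not from the original
+# source):
+#
+#     opts, args = parse_args(['-v', '-cutoff=0.5', 'in.bam'],
+#                             defaults={'cutoff': 1.0}, expected_argc=2)
+#     # opts -> {'cutoff': 0.5, 'v': True}; args -> ['in.bam', None]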
+
+
+class memoize(object):
+    'Simple memoizing decorator to cache results'
+    def __init__(self, func):
+        self.func = func
+        self.cache = {}
+
+    def __call__(self, *args):
+        if not isinstance(args, collections.Hashable):
+            # uncacheable. a list, for instance.
+            # better to not cache than blow up.
+            return self.func(*args)
+
+        if args in self.cache:
+            return self.cache[args]
+        else:
+            value = self.func(*args)
+            self.cache[args] = value
+            return value
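+
+# Usage sketch (hypothetical function):
+#
+#     @memoize
+#     def slow_lookup(key):
+#         ...  # expensive computation
+#
+# Repeated calls with the same hashable arguments return the cached result;
+# unhashable arguments fall through to a direct, uncached call.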
b
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/ngs_utils.pyc
b
Binary file ngsutils/support/ngs_utils.pyc has changed
b
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/regions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/regions.py Wed Nov 11 13:04:07 2015 -0500
[
@@ -0,0 +1,166 @@
+class RangeMatch(object):
+    '''
+    Simple genomic ranges.  You can define chrom:start-end ranges, then ask if a
+    particular genomic coordinate maps to any of those ranges.  This is less
+    efficient than an R-Tree, but easier to code.
+    '''
+    def __init__(self, name):
+        self.ranges = {}
+        self.name = name
+
+    def add_range(self, chrom, strand, start, end):
+        if chrom not in self.ranges:
+            self.ranges[chrom] = {}
+
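+        # Ranges are indexed into 100 kb bins so that get_tag only has to
+        # scan ranges near the query position.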
+        bin = start / 100000
+        if bin not in self.ranges[chrom]:
+            self.ranges[chrom][bin] = []
+        self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+        if (end / 100000) != bin:
+            for bin in xrange(bin + 1, (end / 100000) + 1):
+                if bin not in self.ranges[chrom]:
+                    self.ranges[chrom][bin] = []
+                self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+    def get_tag(self, chrom, strand, pos, ignore_strand=False):
+        '''
+        returns (region, is_reverse_orientation)
+        '''
+        if chrom not in self.ranges:
+            return None, False
+        bin = pos / 100000
+        if bin not in self.ranges[chrom]:
+            return None, False
+        for start, end, r_strand in self.ranges[chrom][bin]:
+            if pos >= start and pos <= end:
+                if ignore_strand or strand == r_strand:
+                    return self.name, False
+                return self.name, True
+        return None, False
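+
+# Usage sketch (chromosome and coordinates are illustrative):
+#
+#     utr = RangeMatch('utr')
+#     utr.add_range('chr1', '+', 1000, 2000)
+#     utr.get_tag('chr1', '+', 1500)    # -> ('utr', False)
+#     utr.get_tag('chr1', '-', 1500)    # -> ('utr', True)  (reverse strand)
+#     utr.get_tag('chr1', '+', 5000)    # -> (None, False)  (no match)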
+
+
+class RegionTagger(object):
+    def __init__(self, gtf, valid_chroms=None, only_first_fragment=True):
+        self.regions = []
+        self.counts = {}
+        self.only_first_fragment = only_first_fragment
+
+        coding = RangeMatch('coding')
+        exons = RangeMatch('other-exon')
+        utr_5 = RangeMatch('utr-5')
+        utr_3 = RangeMatch('utr-3')
+        introns = RangeMatch('intron')
+        promoters = RangeMatch('promoter')
+
+        for gene in gtf.genes:
+            if valid_chroms and gene.chrom not in valid_chroms:
+                continue
+            if gene.strand == '+':
+                promoters.add_range(gene.chrom, gene.strand, gene.start - 2000, gene.start)
+            else:
+                promoters.add_range(gene.chrom, gene.strand, gene.end, gene.end + 2000)
+
+            for transcript in gene.transcripts:
+                if transcript.has_cds:
+                    for start, end in transcript.cds:
+                        coding.add_range(gene.chrom, gene.strand, start, end)
+
+                    # TODO: Fix this so that it iterates over exons in the 5'/3' UTRS
+                    for s, e in transcript.utr_5:
+                        utr_5.add_range(gene.chrom, gene.strand, s, e)
+                    for s, e in transcript.utr_3:
+                        utr_3.add_range(gene.chrom, gene.strand, s, e)
+
+                last_end = None
+                for start, end in transcript.exons:
+                    if last_end:
+                        introns.add_range(gene.chrom, gene.strand, last_end, start)
+                    exons.add_range(gene.chrom, gene.strand, start, end)
+                    last_end = end
+
+        self.regions.append(coding)
+        self.regions.append(utr_5)
+        self.regions.append(utr_3)
+        self.regions.append(exons)
+        self.regions.append(introns)
+        self.regions.append(promoters)
+
+        self.counts['coding'] = 0
+        self.counts['coding-rev'] = 0
+        self.counts['other-exon'] = 0
+        self.counts['utr-5'] = 0
+        self.counts['utr-3'] = 0
+        self.counts['utr-5-rev'] = 0
+        self.counts['utr-3-rev'] = 0
+        self.counts['intron'] = 0
+        self.counts['promoter'] = 0
+        self.counts['other-exon-rev'] = 0
+        self.counts['intron-rev'] = 0
+        self.counts['promoter-rev'] = 0
+        self.counts['junction'] = 0
+        self.counts['intergenic'] = 0
+        self.counts['mitochondrial'] = 0
+
+    def add_read(self, read, chrom):
+        if read.is_unmapped:
+            return
+
+        if self.only_first_fragment and read.is_paired and not read.is_read1:
+            return
+
+        tag = None
+        is_rev = False
+
+        strand = '-' if read.is_reverse else '+'
+
+        if chrom == 'chrM':
+            tag = 'mitochondrial'
+
+        if not tag:
+            for op, length in read.cigar:
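+                # CIGAR op 3 ('N') is a reference skip, i.e. a splice junction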
+                if op == 3:
+                    tag = 'junction'
+                    break
+
+        if not tag:
+            for region in self.regions:
+                tag, is_rev = region.get_tag(chrom, strand, read.pos)
+                if tag:
+                    break
+
+        if not tag:
+            tag = 'intergenic'
+
+        if tag:
+            if is_rev:
+                self.counts['%s-rev' % tag] += 1
+            else:
+                self.counts[tag] += 1
+
+        return tag
+
+    def tag_region(self, chrom, start, end, strand):
+        tag = None
+        is_rev = False
+
+        if chrom == 'chrM' or chrom == 'M':
+            tag = 'mitochondrial'
+
+        if not tag:
+            for region in self.regions:
+                endtag = None  # stays None when start == end
+                tag, is_rev = region.get_tag(chrom, strand, start)
+                if is_rev:
+                    tag = '%s-rev' % tag
+
+                if start != end:
+                    endtag, is_rev = region.get_tag(chrom, strand, end)
+                    if is_rev:
+                        endtag = '%s-rev' % endtag
+
+                if tag and endtag and endtag != tag:
+                    tag = '%s/%s' % (tag, endtag)
+
+                if tag:
+                    break
+
+        if not tag:
+            tag = 'intergenic'
+
+        return tag
\ No newline at end of file
b
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/stats.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/stats.py Wed Nov 11 13:04:07 2015 -0500
[
@@ -0,0 +1,161 @@
+'''
+various statistical tests and methods...
+'''
+import math
+from ngsutils.support import memoize
+
+def median(vals):
+    '''
+    >>> median([1,2,3])
+    2
+    >>> median([1,2,3,4])
+    2.5
+    '''
+    vals = sorted(vals)
+
+    if len(vals) % 2 == 1:
+        return vals[len(vals) / 2]
+    else:
+        a = vals[(len(vals) / 2) - 1]
+        b = vals[(len(vals) / 2)]
+        return float(a + b) / 2
+
+
+def mean_stdev(l):
+    '''
+    >>> mean_stdev([1,2,2,2])
+    (1.75, 0.5)
+    >>> mean_stdev([2,2,2,2])
+    (2.0, 0.0)
+    '''
+
+    acc = 0
+    for el in l:
+        acc += el
+
+    mean = float(acc) / len(l)
+    acc = 0
+    for el in l:
+        acc += (el - mean) ** 2
+
+    if len(l) > 1:
+        stdev = math.sqrt(float(acc) / (len(l) - 1))
+    else:
+        stdev = 0.0
+
+    return (mean, stdev)
+
+
+def counts_median(d):
+    '''
+    Calculate the median from counts stored in a dictionary
+    >>> counts_median({ 1: 4, 2: 1, 3: 4 })
+    2
+    >>> counts_median({ 1: 4, 3: 4 })
+    2.0
+
+    '''
+    count = 0
+    for k in d:
+        count += d[k]
+
+    if count == 0:
+        return 0
+
+    acc = 0.0
+    last = 0
+    for k in sorted(d):
+        if last:
+            return (last + k) / 2.0
+        acc += d[k]
+        if acc / count == 0.5:
+            last = k
+        elif acc / count > 0.5:
+            return k
+
+
+def counts_mean_stdev(d):
+    '''
+
+    calc mean / stdev when data is stored as counts in a dictionary
+
+    Ex:
+        { 1: 4, 2: 1, 3: 4 } = [1, 1, 1, 1, 2, 3, 3, 3, 3]
+
+    >>> counts_mean_stdev({ 1: 4, 2: 1, 3: 4 })
+    (2.0, 1.0)
+
+    '''
+
+    acc = 0
+    count = 0
+    for k in d:
+        acc += k * d[k]
+        count += d[k]
+
+    mean = float(acc) / count
+
+    acc = 0
+    for k in d:
+        acc += (((k - mean) ** 2) * d[k])
+
+    if count > 1:
+        stdev = math.sqrt(float(acc) / (count - 1))
+    else:
+        stdev = 0.0
+
+    return (mean, stdev)
+
+@memoize
+def poisson_prob(x, mean):
+    '''
+        Return the probability that you could get x counts in
+        a Poisson test with a mean value.
+
+        prob(x) = sum(i=1..x){poisson(i)}
+
+        >>> poisson_prob(6,10)
+        0.1300960209527205
+        >>> poisson_prob(8,10)
+        0.33277427882095645
+    '''
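+    # Note: the sum starts at i=1, so this equals the Poisson CDF at x
+    # minus the i=0 term (e^-mean).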
+    acc = 0.0
+    for i in xrange(1, x+1):
+        acc += poisson_func(i, mean)
+    return acc
+
+@memoize
+def poisson_func(mu, lambd):
+    '''
+        This is the Poisson distribution function
+
+        p(mu) = (lambda^mu * e^(-lambda)) / (mu!)
+
+        mu is a count
+        lambd is the mean
+
+        >>> poisson_func(1,10)
+        0.00045399929762484856
+        >>> poisson_func(2,10)
+        0.0022699964881242427
+        >>> poisson_func(3,10)
+        0.007566654960414142
+    '''
+    return (lambd ** mu) * (math.exp(-1 * lambd)) / _factorial(mu)
+
+
+@memoize
+def _factorial(x):
+    '''
+    >>> _factorial(1)
+    1
+    >>> _factorial(2)
+    2
+    >>> _factorial(3)
+    6
+    '''
+    return math.factorial(x)
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
b
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_input1.bam
b
Binary file test-data/ngsutils_bam_filter_input1.bam has changed
b
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result1.bam
b
Binary file test-data/ngsutils_bam_filter_result1.bam has changed
b
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result2.bam
b
Binary file test-data/ngsutils_bam_filter_result2.bam has changed
b
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result3.bam
b
Binary file test-data/ngsutils_bam_filter_result3.bam has changed
b
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result4.bam
b
Binary file test-data/ngsutils_bam_filter_result4.bam has changed
b
diff -r 000000000000 -r 4e4e4093d65d tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Wed Nov 11 13:04:07 2015 -0500
b
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="pysam" version="0.7.7">
+        <repository changeset_revision="0a5141bdf9d0" name="package_pysam_0_7_7" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>