# HG changeset patch
# User iuc
# Date 1447265047 18000
# Node ID 4e4e4093d65d3c7377067f5d7499d454b228aca7
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
diff -r 000000000000 -r 4e4e4093d65d bam_filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bam_filter.xml Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,231 @@
+
+ Removes reads from a BAM file based on criteria
+
+ macros.xml
+
+
+
+
+
+
+
+
+        #if int($uniq) > -1:
+ -uniq
+ #if int($uniq) > 0:
+ $uniq
+ #end if
+ #end if
+ $uniq_start
+ #if $mismatch:
+ -mismatch $mismatch
+ #end if
+ $nosecondary
+ $noqcfail
+ $nopcrdup
+ #if $excludebed:
+ -excludebed "${excludebed}" $ignore_strand
+ #end if
+ #if $includebed:
+ -includebed "${includebed}" $ignore_strand
+ #end if
+ #if $includeref:
+ -includeref "${includeref}"
+ #end if
+ #if $excluderef:
+ -excluderef "${excluderef}"
+ #end if
+ #if $maximum_mismatch_ratio
+ -maximum_mismatch_ratio $maximum_mismatch_ratio
+ #end if
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 4e4e4093d65d filter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,986 @@
+#!/usr/bin/env python
+## category General
+## desc Removes reads from a BAM file based on criteria
+"""
+Removes reads from a BAM file based on criteria
+
+Given a BAM file, this script will only allow reads that meet filtering
+criteria to be written to output. The output is another BAM file with the
+reads not matching the criteria removed.
+
+Note: this does not adjust tag values to reflect the filtering. (For example,
+    if a read mapped to two locations (IH:i:2) and one was removed by
+    filtering, the IH:i tag would still read IH:i:2.)
+
+Currently, the available filters are:
+ -minlen val Remove reads that are smaller than {val}
+ -maxlen val Remove reads that are larger than {val}
+ -mapped Keep only mapped reads
+ -unmapped Keep only unmapped reads
+ -properpair Keep only properly paired reads (both mapped,
+ correct orientation, flag set in BAM)
+    -noproperpair               Keep only reads that are NOT properly paired
+
+ -mask bitmask Remove reads that match the mask (base 10/hex)
+    -uniq {length}              Remove reads that have the same sequence
+                                (up to an optional {length})
+                                Note: BAM file should be sorted
+ -uniq_start Remove reads that start at the same position
+ Note: BAM file should be sorted
+ (Use only for low-coverage samples)
+
+ -mismatch num # mismatches or indels
+ indel always counts as 1 regardless of length
+ (requires NM tag in reads)
+
+ -mismatch_dbsnp num dbsnp.txt.bgz
+ # mismatches or indels - not in dbSNP.
+ Variations are called using the MD tag.
+ Variations that are found in the dbSNP list are
+ not counted as mismatches. The dbSNP list is a
+ Tabix-indexed dump of dbSNP (from UCSC Genome
+ Browser). Indels in dbSNP are also counted.
+ Adds a 'ZS:i' tag with the number of found SNPs
+ in the read.
+ (requires NM and MD tags)
+
+ Example command for indexing:
+ ngsutils tabixindex snp.txt.gz -s 2 -b 3 -e 4 -0
+
+    -mismatch_ref num ref.fa    # mismatches or indels - looks up mismatches
+ directly in a reference FASTA file
+ (use if NM tag not present)
+
+ -mismatch_ref_dbsnp num ref.fa dbsnp.txt.bgz
+ # mismatches or indels - looks up mismatches
+ directly from a reference FASTA file. (See
+ -mismatch_dbsnp for dbSNP matching)
+ (use if NM or MD tag not present)
+
+ -nosecondary Remove reads that have the 0x100 flag set
+ -noqcfail Remove reads that have the 0x200 flag set
+ -nopcrdup Remove reads that have the 0x400 flag set
+
+
+ -exclude ref:start-end Remove reads in this region (1-based start)
+ -excludebed file.bed {nostrand}
+ Remove reads that are in any of the regions
+ from the given BED file. If 'nostrand' is given,
+ strand information from the BED file is ignored.
+
+ -include ref:start-end Remove reads NOT in the region (can only be one)
+ -includebed file.bed {nostrand}
+                                Remove reads that are NOT in any of the regions
+ from the given BED file. If 'nostrand' is given,
+ strand information from the BED file is ignored.
+
+ Note: If this is a large dataset, use
+ "bamutils extract" instead.
+
+    -includeref refname         Remove reads NOT mapped to the given reference
+    -excluderef refname         Remove reads mapped to the given reference
+                                (e.g. chrM, or _dup chromosomes)
+
+ -whitelist fname Remove reads that aren't on this list (by name)
+ -blacklist fname Remove reads that are on this list (by name)
+ These lists can be whitespace-delimited with
+ the read name as the first column.
+ -maximum_mismatch_ratio val
+ Filter by maximum mismatch ratio (fraction of length)
+
+ -eq tag_name value
+ -lt tag_name value
+ -lte tag_name value
+ -gt tag_name value
+ -gte tag_name value
+
+ As a special case, "MAPQ" can be used as the tag_name and the SAM MAPQ
+ value will be used.
+
+Common tags to filter by:
+ AS Alignment score
+ IH Number of alignments
+    NM      Edit distance (each indel counts as many edits as its length)
+
+ MAPQ Mapping quality (defined as part of SAM spec)
+
+ The tag type (:i, :f, :Z) is optional.
+
+"""
+
+import os
+import sys
+import pysam
+from ngsutils.bam import bam_iter
+from ngsutils.support.dbsnp import DBSNP
+from ngsutils.bam import read_calc_mismatches, read_calc_mismatches_ref, read_calc_mismatches_gen, read_calc_variations
+from ngsutils.bed import BedFile
+
+
+def usage():
+ print __doc__
+ print """
+Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria...
+
+Options:
+ -failed fname A text file containing the read names of all reads
+ that were removed with filtering
+
+Example:
+bamutils filter filename.bam output.bam -mapped -gte AS:i 1000
+
+This will remove all unmapped reads, as well as any reads that have an AS:i
+value less than 1000.
+"""
+ sys.exit(1)
+
+
+class Unique(object):
+ def __init__(self, length=None):
+ if length:
+ self.length = int(length)
+ else:
+ self.length = None
+
+ self.last_pos = None
+ self.pos_reads = set()
+
+ def __repr__(self):
+ return "uniq"
+
+ def filter(self, bam, read):
+ if self.last_pos != (read.tid, read.pos):
+ self.last_pos = (read.tid, read.pos)
+ self.pos_reads = set()
+
+ if read.is_reverse:
+ seq = read.seq[::-1] # ignore revcomp for now, it isn't needed, just need to compare in the proper order
+ else:
+ seq = read.seq
+
+ if self.length:
+ seq = seq[:self.length]
+
+ if seq in self.pos_reads:
+ return False
+
+ self.pos_reads.add(seq)
+ return True
+
+ def close(self):
+ pass
+
+
+class UniqueStart(object):
+    def __init__(self):
+        self.last_tid = None
+        self.last_fwd_pos = -1
+        self.last_rev_pos = -1
+        self.rev_pos = set()
+
+ def __repr__(self):
+ return "uniq_start"
+
+ def filter(self, bam, read):
+ if self.last_tid is None or self.last_tid != read.tid:
+ self.last_tid = read.tid
+ self.rev_pos = set()
+ self.last_fwd_pos = -1
+ self.last_rev_pos = -1
+
+ if read.is_reverse:
+ # check reverse reads from their start (3' aend)
+ # these aren't necessarily in the correct
+ # order in the file, so we have to track them in a set
+
+ start_pos = read.aend
+
+            # clean up the hash if necessary
+            # Allow up to 100kb of overrun, to balance cleaning up the
+            # rev_pos hash against memory use
+ if read.pos > (self.last_rev_pos + 100000):
+ self.last_rev_pos = start_pos
+ del_list = []
+ for k in self.rev_pos:
+ if k < read.pos:
+ del_list.append(k)
+
+ for k in del_list:
+ self.rev_pos.remove(k)
+
+            if start_pos not in self.rev_pos:
+ self.rev_pos.add(start_pos)
+ return True
+ return False
+ else:
+ if read.pos != self.last_fwd_pos:
+ self.last_fwd_pos = read.pos
+ return True
+ return False
+
+ # if self.last_pos != (read.tid, read.pos):
+ # self.last_pos = (read.tid, read.pos)
+ # return True
+ # return False
+
+ def close(self):
+ pass
+
+
+class Blacklist(object):
+ def __init__(self, fname):
+ self.fname = fname
+ self.notallowed = set()
+ with open(fname) as f:
+ for line in f:
+ self.notallowed.add(line.strip().split()[0])
+
+ def filter(self, bam, read):
+ return read.qname not in self.notallowed
+
+ def __repr__(self):
+ return 'Blacklist: %s' % (self.fname)
+
+ def close(self):
+ pass
+
+
+class Whitelist(object):
+ def __init__(self, fname):
+ self.fname = fname
+ self.allowed = set()
+ with open(fname) as f:
+ for line in f:
+ self.allowed.add(line.strip().split()[0])
+
+ def filter(self, bam, read):
+ return read.qname in self.allowed
+
+ def __repr__(self):
+ return 'Whitelist: %s' % (self.fname)
+
+ def close(self):
+ pass
+
+
+class IncludeRegion(object):
+ _excludes = []
+ _last = None
+
+    def __init__(self, region):
+        self.region = region
+        IncludeRegion._excludes.append(ExcludeRegion(region))
+
+ def filter(self, bam, read):
+ if read == IncludeRegion._last:
+ return True
+
+ IncludeRegion._last = read
+
+ for excl in IncludeRegion._excludes:
+ if not excl.filter(bam, read):
+ return True
+ return False
+
+    def __repr__(self):
+        return 'Including: %s' % (self.region)
+
+ def close(self):
+ pass
+
+
+class IncludeBED(object):
+ def __init__(self, fname, nostrand=None):
+ self.excl = ExcludeBED(fname, nostrand)
+
+ def filter(self, bam, read):
+ return not self.excl.filter(bam, read)
+
+ def __repr__(self):
+ return 'Including from BED: %s%s' % (self.excl.fname, ' nostrand' if self.excl.nostrand else '')
+
+ def close(self):
+ pass
+
+
+class ExcludeRegion(object):
+ def __init__(self, region):
+ self.region = region
+ spl = region.split(':')
+ self.chrom = spl[0]
+ se = [int(x) for x in spl[1].split('-')]
+ self.start = se[0] - 1
+ self.end = se[1]
+
+ def filter(self, bam, read):
+ if not read.is_unmapped:
+ if bam.getrname(read.tid) == self.chrom:
+ if self.start <= read.pos <= self.end:
+ return False
+ if self.start <= read.aend <= self.end:
+ return False
+ return True
+
+ def __repr__(self):
+ return 'Excluding: %s' % (self.region)
+
+ def close(self):
+ pass
+
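+# A short sketch of the coordinate handling above (region string hypothetical):
+# ExcludeRegion converts the 1-based, inclusive "chrom:start-end" notation into
+# the 0-based start that pysam uses.
+#
+#   excl = ExcludeRegion('chr1:1000-2000')
+#   excl.chrom   # 'chr1'
+#   excl.start   # 999  (1-based start minus one)
+#   excl.end     # 2000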
+
+class ExcludeRef(object):
+ def __init__(self, ref):
+ self.ref = ref
+
+ def filter(self, bam, read):
+ if not read.is_unmapped:
+ if bam.getrname(read.tid) == self.ref:
+ return False
+ return True
+
+ def __repr__(self):
+ return 'Excluding: %s' % (self.ref)
+
+ def close(self):
+ pass
+
+class IncludeRef(object):
+ def __init__(self, ref):
+ self.ref = ref
+
+ def filter(self, bam, read):
+ if not read.is_unmapped:
+ if bam.getrname(read.tid) == self.ref:
+ return True
+ return False
+
+ def __repr__(self):
+ return 'Including: %s' % (self.ref)
+
+ def close(self):
+ pass
+
+
+class ExcludeBED(object):
+ def __init__(self, fname, nostrand=None):
+ self.regions = {} # store BED regions as keyed bins (chrom, bin)
+ self.fname = fname
+ if nostrand == 'nostrand':
+ self.nostrand = True
+ else:
+ self.nostrand = False
+
+ self.bed = BedFile(fname)
+ # with open(fname) as f:
+ # for line in f:
+ # if not line:
+ # continue
+ # if line[0] == '#':
+ # continue
+ # cols = line.strip().split('\t')
+
+ # chrom = cols[0]
+ # start = int(cols[1])
+ # end = int(cols[2])
+ # if self.nostrand:
+ # strand = '?'
+ # else:
+ # strand = cols[5]
+
+ # startbin = start / 100000
+ # endbin = end / 100000
+
+ # for bin in xrange(startbin, endbin + 1):
+ # if not (chrom, bin) in self.regions:
+ # self.regions[(chrom, bin)] = []
+ # self.regions[(chrom, bin)].append((start, end, strand))
+
+ def filter(self, bam, read):
+ if not read.is_unmapped:
+ if self.nostrand:
+ strand = None
+ elif read.is_reverse:
+ strand = '-'
+ else:
+ strand = '+'
+
+ for region in self.bed.fetch(bam.getrname(read.tid), read.pos, read.aend, strand):
+ # region found, exclude read
+ return False
+ return True
+
+ # bin = read.pos / 100000
+ # ref = bam.getrname(read.tid)
+
+ # if not (ref, bin) in self.regions:
+ # return True
+
+ # for start, end, strand in self.regions[(ref, bin)]:
+ # if not self.nostrand:
+ # if strand == '+' and read.is_reverse:
+ # continue
+ # if strand == '-' and not read.is_reverse:
+ # continue
+ # if start <= read.pos <= end:
+ # return False
+ # if start <= read.aend <= end:
+ # return False
+ # return True
+
+ def __repr__(self):
+ return 'Excluding from BED: %s%s' % (self.fname, ' nostrand' if self.nostrand else '')
+
+ def close(self):
+ pass
+
+
+class Mismatch(object):
+ def __init__(self, num):
+ self.num = int(num)
+
+ def filter(self, bam, read):
+ if read.is_unmapped:
+ return False
+
+ if read_calc_mismatches(read) > self.num:
+ return False
+
+ return True
+
+ def __repr__(self):
+ return '>%s mismatch%s' % (self.num, '' if self.num == 1 else 'es')
+
+ def close(self):
+ pass
+
+
+class MismatchRef(object):
+ def __init__(self, num, refname):
+ self.num = int(num)
+ self.refname = refname
+
+ if not os.path.exists('%s.fai' % refname):
+ pysam.faidx(refname)
+
+ self.ref = pysam.Fastafile(refname)
+
+ def filter(self, bam, read):
+ if read.is_unmapped:
+ return False
+
+ chrom = bam.getrname(read.tid)
+ if read_calc_mismatches_ref(self.ref, read, chrom) > self.num:
+ return False
+
+ return True
+
+ def __repr__(self):
+ return '>%s mismatch%s in %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.refname))
+
+ def close(self):
+ self.ref.close()
+
+
+class MismatchDbSNP(object):
+ def __init__(self, num, fname, verbose=None):
+ sys.stderr.write('Note: MismatchDbSNP is considered *experimental*\n')
+
+ self.num = int(num)
+ self.fname = fname
+ self.dbsnp = DBSNP(fname)
+ if verbose == 'verbose':
+ self.verbose = True
+ else:
+ self.verbose = False
+
+ def filter(self, bam, read):
+ if read.is_unmapped:
+ return False
+
+ if read_calc_mismatches(read) <= self.num:
+ return True
+
+ chrom = bam.getrname(read.tid)
+
+ mm = 0
+ snps = 0
+
+ for op, pos, seq in read_calc_variations(read):
+ if not self.dbsnp.is_valid_variation(chrom, op, pos, seq, self.verbose):
+ mm += 1
+ else:
+ snps += 1
+
+ if mm > self.num:
+ return False
+
+ if snps:
+ read.tags = read.tags + [('ZS', snps)]
+
+ return True
+
+ def __repr__(self):
+ return '>%s mismatch%s using %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.fname))
+
+ def close(self):
+ self.dbsnp.close()
+
+
+class MismatchRefDbSNP(object):
+ def __init__(self, num, refname, dbsnpname):
+ sys.stderr.write('Note: MismatchRefDbSNP is considered *experimental*\n')
+        self.num = int(num)
+        self.refname = refname
+        self.dbsnpname = dbsnpname
+        self.dbsnp = DBSNP(dbsnpname)
+
+ if not os.path.exists('%s.fai' % refname):
+ pysam.faidx(refname)
+
+ self.ref = pysam.Fastafile(refname)
+
+ def filter(self, bam, read):
+ if read.is_unmapped:
+ return False
+
+ chrom = bam.getrname(read.tid)
+
+ mm = 0
+ snps = 0
+
+ for op, pos, seq in read_calc_mismatches_gen(self.ref, read, chrom):
+ if not self.dbsnp.is_valid_variation(chrom, op, pos, seq):
+ mm += 1
+ else:
+ snps += 1
+
+ if mm > self.num:
+ return False
+
+ if snps:
+ read.tags = read.tags + [('ZS', snps)]
+
+ return True
+
+ def __repr__(self):
+ return '>%s mismatch%s using %s/%s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.dbsnpname), os.path.basename(self.refname))
+
+ def close(self):
+ self.ref.close()
+ self.dbsnp.close()
+
+
+class Mapped(object):
+ def __init__(self):
+ pass
+
+ def filter(self, bam, read):
+ if read.is_paired and (read.is_unmapped or read.mate_is_unmapped):
+ return False
+ elif read.is_unmapped:
+ return False
+ return True
+
+ def __repr__(self):
+ return 'is mapped'
+
+ def close(self):
+ pass
+
+
+class Unmapped(object):
+ def __init__(self):
+ pass
+
+ def filter(self, bam, read):
+ if read.is_paired and (read.is_unmapped or read.mate_is_unmapped):
+ return True
+ elif read.is_unmapped:
+ return True
+ return False
+
+ def __repr__(self):
+ return 'is unmapped'
+
+ def close(self):
+ pass
+
+
+class ProperPair(object):
+ def __init__(self):
+ pass
+
+ def filter(self, bam, read):
+ if not read.is_paired:
+ return False
+
+ if read.is_unmapped or read.mate_is_unmapped:
+ return False
+
+ if read.is_reverse == read.mate_is_reverse:
+ return False
+
+ return read.is_proper_pair
+
+ def __repr__(self):
+ return 'proper pair'
+
+ def close(self):
+ pass
+
+
+class NoProperPair(object):
+    def __init__(self):
+        self.proper = ProperPair()
+
+ def filter(self, bam, read):
+ return not self.proper.filter(bam, read)
+
+ def __repr__(self):
+ return 'not proper pairs'
+
+ def close(self):
+ pass
+
+
+class MaskFlag(object):
+ def __init__(self, value):
+        if isinstance(value, int):
+ self.flag = value
+ else:
+ if value[0:2] == '0x':
+ self.flag = int(value, 16)
+ else:
+ self.flag = int(value)
+
+ def __repr__(self):
+ return "Doesn't match flag: %s" % self.flag
+
+ def filter(self, bam, read):
+ return (read.flag & self.flag) == 0
+
+ def close(self):
+ pass
+
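+# A brief sketch of MaskFlag semantics (the mask value below is illustrative;
+# bit meanings are from the SAM spec): reads are kept only when none of the
+# masked bits are set in read.flag.
+#
+#   mask = MaskFlag('0x404')   # 0x4 (unmapped) | 0x400 (PCR duplicate)
+#   mask.filter(bam, read)     # True unless the read is unmapped or a duplicate
+#
+# Decimal input works the same way: MaskFlag('1028') builds an equal filter.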
+
+class SecondaryFlag(object):
+ def __repr__(self):
+ return "no 0x100 (secondary) flag"
+
+ def filter(self, bam, read):
+ return not read.is_secondary
+
+ def close(self):
+ pass
+
+
+class ReadMinLength(object):
+ def __init__(self, minval):
+ self.minval = int(minval)
+
+ def __repr__(self):
+ return "read length min: %s" % self.minval
+
+ def filter(self, bam, read):
+ return len(read.seq) >= self.minval
+
+ def close(self):
+ pass
+
+
+class ReadMaxLength(object):
+ def __init__(self, val):
+ self.val = int(val)
+
+ def __repr__(self):
+ return "read length max: %s" % self.val
+
+ def filter(self, bam, read):
+ return len(read.seq) <= self.val
+
+ def close(self):
+ pass
+
+
+class MaximumMismatchRatio(object):
+ def __init__(self, ratio):
+ self.ratio = float(ratio)
+
+    def __repr__(self):
+        return "maximum mismatch ratio: %s" % self.ratio
+
+ def filter(self, bam, read):
+        return read_calc_mismatches(read) <= self.ratio * len(read.seq)
+
+ def close(self):
+ pass
+
+
+class QCFailFlag(object):
+ def __repr__(self):
+ return "no 0x200 (qcfail) flag"
+
+ def filter(self, bam, read):
+ return not read.is_qcfail
+
+ def close(self):
+ pass
+
+
+class PCRDupFlag(object):
+ def __repr__(self):
+ return "no 0x400 (pcrdup) flag"
+
+ def filter(self, bam, read):
+ return not read.is_duplicate
+
+ def close(self):
+ pass
+
+
+class _TagCompare(object):
+ def __init__(self, tag, value):
+ self.args = '%s %s' % (tag, value)
+
+ if ':' in tag:
+ self.tag = tag.split(':')[0]
+ tagtype = tag.split(':')[1]
+ if tagtype == 'i':
+ self.value = int(value)
+ elif tagtype == 'f':
+ self.value = float(value)
+ elif tagtype == 'H':
+ self.value = int(value) # not sure about this
+ else:
+ self.value = value
+ else:
+ self.tag = tag
+
+ # guess at type...
+ try:
+ self.value = int(value)
+ except:
+ try:
+ self.value = float(value)
+ except:
+ self.value = value
+
+ def get_value(self, read):
+ if self.tag == 'MAPQ':
+ return read.mapq
+
+ for name, value in read.tags:
+ if name == self.tag:
+ return value
+
+ return None
+
+ def __repr__(self):
+ return "%s %s %s" % (self.tag, self.__class__.op, self.value)
+
+ def close(self):
+ pass
+
+
+class TagLessThan(_TagCompare):
+ op = '<'
+
+ def filter(self, bam, read):
+ if self.get_value(read) < self.value:
+ return True
+ return False
+
+
+class TagLessThanEqual(_TagCompare):
+ op = '<='
+
+ def filter(self, bam, read):
+ if self.get_value(read) <= self.value:
+ return True
+ return False
+
+
+class TagGreaterThan(_TagCompare):
+ op = '>'
+
+ def filter(self, bam, read):
+ if self.get_value(read) > self.value:
+ return True
+ return False
+
+
+class TagGreaterThanEqual(_TagCompare):
+ op = '>='
+
+ def filter(self, bam, read):
+ if self.get_value(read) >= self.value:
+ return True
+ return False
+
+
+class TagEqual(_TagCompare):
+ op = '='
+
+ def filter(self, bam, read):
+ if self.get_value(read) == self.value:
+ return True
+ return False
+
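+# A minimal sketch of the tag comparisons (tag and threshold illustrative):
+# "-gte AS:i 1000" becomes TagGreaterThanEqual('AS:i', '1000'); the ':i'
+# suffix coerces the value to int, and "MAPQ" is special-cased in
+# _TagCompare.get_value to read read.mapq instead of a tag.
+#
+#   f = TagGreaterThanEqual('AS:i', '1000')
+#   f.filter(bam, read)   # True when the read's AS tag is >= 1000
+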
+_criteria = {
+ 'mapped': Mapped,
+ 'unmapped': Unmapped,
+ 'properpair': ProperPair,
+ 'noproperpair': NoProperPair,
+ 'noqcfail': QCFailFlag,
+ 'nosecondary': SecondaryFlag,
+ 'nopcrdup': PCRDupFlag,
+ 'mask': MaskFlag,
+ 'lt': TagLessThan,
+ 'gt': TagGreaterThan,
+ 'lte': TagLessThanEqual,
+ 'gte': TagGreaterThanEqual,
+ 'eq': TagEqual,
+ 'mismatch': Mismatch,
+ 'mismatch_ref': MismatchRef,
+ 'mismatch_dbsnp': MismatchDbSNP,
+ 'mismatch_ref_dbsnp': MismatchRefDbSNP,
+ 'exclude': ExcludeRegion,
+ 'excludebed': ExcludeBED,
+ 'excluderef': ExcludeRef,
+ 'includeref': IncludeRef,
+ 'include': IncludeRegion,
+ 'includebed': IncludeBED,
+ 'whitelist': Whitelist,
+ 'blacklist': Blacklist,
+ 'length': ReadMinLength, # deprecated
+ 'minlen': ReadMinLength,
+ 'maxlen': ReadMaxLength,
+ 'uniq': Unique,
+ 'maximum_mismatch_ratio': MaximumMismatchRatio
+}
+
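+# A condensed sketch of how the argument loop in __main__ (below) maps CLI
+# criteria onto the _criteria classes (argument values are illustrative):
+#
+#   argv:  -mapped -gte MAPQ 20 -excluderef chrM
+#   ->     [Mapped(), TagGreaterThanEqual('MAPQ', '20'), ExcludeRef('chrM')]
+#
+# Each "-name" token starts a new criterion; the bare arguments that follow it
+# are handed to that class's constructor, so each __init__ defines the arity.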
+
+def bam_filter(infile, outfile, criteria, failedfile=None, verbose=False):
+ if verbose:
+ sys.stderr.write('Input file : %s\n' % infile)
+ sys.stderr.write('Output file : %s\n' % outfile)
+ if failedfile:
+ sys.stderr.write('Failed reads: %s\n' % failedfile)
+ sys.stderr.write('Criteria:\n')
+ for criterion in criteria:
+ sys.stderr.write(' %s\n' % criterion)
+
+ sys.stderr.write('\n')
+
+ bamfile = pysam.Samfile(infile, "rb")
+ outfile = pysam.Samfile(outfile, "wb", template=bamfile)
+
+ if failedfile:
+ failed_out = open(failedfile, 'w')
+ else:
+ failed_out = None
+
+ passed = 0
+ failed = 0
+
+ def _callback(read):
+ return "%s | %s kept,%s failed" % ('%s:%s' % (bamfile.getrname(read.tid), read.pos) if read.tid > -1 else 'unk', passed, failed)
+
+ for read in bam_iter(bamfile, quiet=True):
+ p = True
+
+ for criterion in criteria:
+ if not criterion.filter(bamfile, read):
+ p = False
+ failed += 1
+ if failed_out:
+ failed_out.write('%s\t%s\n' % (read.qname, criterion))
+ #outfile.write(read_to_unmapped(read))
+ break
+ if p:
+ passed += 1
+ outfile.write(read)
+
+ bamfile.close()
+ outfile.close()
+ if failed_out:
+ failed_out.close()
+ sys.stdout.write("%s kept\n%s failed\n" % (passed, failed))
+
+ for criterion in criteria:
+ criterion.close()
+
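+# A minimal programmatic sketch (file names hypothetical): bam_filter drives
+# the whole pipeline once it is given filter instances.
+#
+#   criteria = [Mapped(), Mismatch(2), ReadMinLength(30)]
+#   bam_filter('in.bam', 'filtered.bam', criteria, failedfile='failed.txt')
+#
+# Reads failing any criterion are dropped (checking stops at the first
+# failure), and their names plus the failing filter go to failed.txt.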
+
+def read_to_unmapped(read):
+ '''
+ Example unmapped read
+ 2358_2045_1839 | 4 | * | 0 | 0 | * | * | 0 | 0 | GGACGAGAAGGAGTATTTCTCCGAGAACACATTCACGGAGAGTCTAACTC | 0QT\VNO^IIRJKXTIHIKTY\STZZ[XOJKPWLHJQQQ^XPQYIIRQII | PG:Z:bfast | AS:i:- | NH:i:0 | IH:i:1 | HI:i:1 | CS:Z:T10213222020221330022203222011113021130222212230122 | CQ:Z:019=2+><%@.'>52%)'15?90<7;:5+(-49('-7-<>5-@5%<.2%= | CM:i:0 | XA:i:4
+
+ Example mapped read:
+ 2216_1487_198 | 16 | chr11:19915291-19925392 | 5531 | 12 | 24M2D26M | * | 0 | 0 | TCTGTCTGGGTGCAGTAGCTATACGTAATCCCAGCACTTTGGGAGGCCAA | 1C8FZ`M""""""WZ""%"#\I;"`R@D]``ZZ``\QKX\Y]````IK^` | PG:Z:bfast | AS:i:1150 | NM:i:2 | NH:i:10 | IH:i:1 | HI:i:1 | MD:Z:24^CT26 | CS:Z:T00103022001002113210023031213331122121223321221122 | CQ:Z:A8=%;AB<;97<3/9:>>3>@?5&18@-%;866:94=14:=?,%?:7&)1 | CM:i:9 | XA:i:4 | XE:Z:-------23322---2-11----2----------------------------
+
+ '''
+ # TODO: rewrite read properties to unmapped state
+ # if colorspace: convert CS to NT directly
+ # remove excess tags
+
+    read.is_unmapped = True
+    read.tid = -1
+ read.pos = 0
+ read.mapq = 0
+ return read
+
+if __name__ == '__main__':
+ infile = None
+ outfile = None
+ failed = None
+ criteria = []
+
+ crit_args = []
+ last = None
+ verbose = False
+ fail = False
+
+ for arg in sys.argv[1:]:
+ if last == '-failed':
+ failed = arg
+ last = None
+ elif arg == '-h':
+ usage()
+ elif arg == '-failed':
+ last = arg
+ elif arg == '-v':
+ verbose = True
+ elif not infile and os.path.exists(arg):
+ infile = arg
+ elif not outfile:
+ outfile = arg
+ elif arg[0] == '-':
+            if arg[1:] not in _criteria:
+ print "Unknown criterion: %s" % arg
+ fail = True
+ if crit_args:
+ criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+ crit_args = [arg, ]
+ elif crit_args:
+ crit_args.append(arg)
+ else:
+ print "Unknown argument: %s" % arg
+ fail = True
+
+ if not fail and crit_args:
+ criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+
+ if fail or not infile or not outfile or not criteria:
+ if not infile and not outfile and not criteria:
+ usage()
+
+ if not infile:
+ print "Missing: input bamfile"
+ if not outfile:
+ print "Missing: output bamfile"
+ if not criteria:
+ print "Missing: filtering criteria"
+ usage()
+ else:
+ bam_filter(infile, outfile, criteria, failed, verbose)
diff -r 000000000000 -r 4e4e4093d65d macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,16 @@
+
+ 0.5.7
+
+
+ pysam
+
+
+
+ @WRAPPER_VERSION@
+
+
+
+
+
+
+
diff -r 000000000000 -r 4e4e4093d65d ngsutils/__init__.py
diff -r 000000000000 -r 4e4e4093d65d ngsutils/__init__.pyc
Binary file ngsutils/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bam/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/bam/__init__.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,879 @@
+import sys
+import os
+import re
+import pysam
+try:
+ from eta import ETA
+except:
+ pass
+import ngsutils.support
+
+
+def bam_open(fname, mode='r', *args, **kwargs):
+ if fname.lower()[-4:] == '.bam':
+ return pysam.Samfile(fname, '%sb' % mode, *args, **kwargs)
+ return pysam.Samfile(fname, '%s' % mode, *args, **kwargs)
+
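+# Usage sketch (file names hypothetical): bam_open dispatches on the
+# extension, so these calls are equivalent to the pysam ones shown.
+#
+#   bam_open('reads.bam')       # pysam.Samfile('reads.bam', 'rb')
+#   bam_open('reads.sam')       # pysam.Samfile('reads.sam', 'r')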
+
+def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None):
+ if not quiet and bam.filename:
+ eta = ETA(os.stat(bam.filename).st_size)
+ else:
+ eta = None
+
+ for pileup in bam.pileup(mask=mask):
+ pos = bam.tell()
+ bgz_offset = pos >> 16
+
+ if not quiet:
+ if callback:
+ eta.print_status(bgz_offset, extra=callback(pileup))
+ else:
+ eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos))
+
+ yield pileup
+
+ if eta:
+ eta.done()
+
+
+def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None):
+ '''
+ >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
+ ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
+ '''
+
+ if os.path.exists('%s.bai' % bam.filename):
+ # This is an indexed file, so it is ref sorted...
+ # Meaning that we should show chrom:pos, instead of read names
+ show_ref_pos = True
+
+ eta = None
+
+ if not ref:
+ if not quiet and bam.filename:
+ eta = ETA(os.stat(bam.filename).st_size)
+
+ for read in bam:
+ pos = bam.tell()
+ bgz_offset = pos >> 16
+
+ if not quiet and eta:
+ if callback:
+ eta.print_status(bgz_offset, extra=callback(read))
+            elif show_ref_pos:
+ if read.tid > -1:
+ eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
+ else:
+ eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname))
+ else:
+ eta.print_status(bgz_offset, extra='%s' % read.qname)
+
+ yield read
+
+ else:
+ working_chrom = None
+ if ref in bam.references:
+ working_chrom = ref
+ elif ref[0:3] == 'chr':
+ # compensate for Ensembl vs UCSC ref naming
+ if ref[3:] in bam.references:
+ working_chrom = ref[3:]
+
+ if not working_chrom:
+ raise ValueError('Missing reference: %s' % ref)
+
+ tid = bam.gettid(working_chrom)
+
+ if not start:
+ start = 0
+ if not end:
+ end = bam.lengths[tid]
+
+ if not quiet and bam.filename:
+ eta = ETA(end - start)
+
+ for read in bam.fetch(working_chrom, start, end):
+ if not quiet and eta:
+ if callback:
+ eta.print_status(read.pos - start, extra=callback(read))
+ else:
+ eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
+
+ yield read
+
+ if eta:
+ eta.done()
+
+
+def bam_batch_reads(bam, quiet=False):
+ '''
+ Batch mapping for the same reads (qname) together, this way
+ they can all be compared/converted together.
+ '''
+ reads = []
+ last = None
+ for read in bam_iter(bam, quiet=quiet):
+ if last and read.qname != last:
+ yield reads
+ reads = []
+ last = read.qname
+ reads.append(read)
+
+ if reads:
+ yield reads
+
+
+bam_cigar = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']
+bam_cigar_op = {
+ 'M': 0,
+ 'I': 1,
+ 'D': 2,
+ 'N': 3,
+ 'S': 4,
+ 'H': 5,
+ 'P': 6,
+ '=': 7,
+ 'X': 8,
+}
+
+
+def cigar_fromstr(s):
+ '''
+ >>> cigar_fromstr('10M5I20M')
+ [(0, 10), (1, 5), (0, 20)]
+ >>> cigar_fromstr('1M1I1D1N1S1H1P')
+ [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
+ '''
+ ret = []
+ spl = re.split('([0-9]+)', s)[1:]
+ for op, size in zip(spl[1::2], spl[::2]):
+ ret.append((bam_cigar_op[op], int(size)))
+ return ret
+
+
+def cigar_tostr(cigar):
+ '''
+ >>> cigar_tostr(((0, 10), (1, 5), (0, 20)))
+ '10M5I20M'
+ >>> cigar_tostr(((0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)))
+ '1M1I1D1N1S1H1P'
+ '''
+
+ s = ''
+
+ for op, size in cigar:
+ s += '%s%s' % (size, bam_cigar[op])
+
+ return s
+
+
+def cigar_read_len(cigar):
+ '''
+ >>> cigar_read_len(cigar_fromstr('8M'))
+ 8
+ >>> cigar_read_len(cigar_fromstr('8M100N8M'))
+ 16
+ >>> cigar_read_len(cigar_fromstr('8M10I8M'))
+ 26
+ >>> cigar_read_len(cigar_fromstr('8M10D8M'))
+ 16
+ '''
+
+ read_pos = 0
+
+ for op, length in cigar:
+ if op == 0: # M
+ read_pos += length
+ elif op == 1: # I
+ read_pos += length
+ elif op == 2: # D
+ pass
+ elif op == 3: # N
+ pass
+ elif op == 4: # S
+ read_pos += length
+ elif op == 5: # H
+ pass
+ elif op == 7: # =
+ read_pos += length
+ elif op == 8: # X
+ read_pos += length
+ else:
+ raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+ return read_pos
+
+
+def read_calc_mismatches(read):
+ inserts = 0
+ deletions = 0
+ indels = 0
+ edits = int(read.opt('NM'))
+ #
+ # NM counts the length of indels
+ # We really just care about *if* there is an indel, not the size
+ #
+
+ for op, length in read.cigar:
+ if op == 1:
+ inserts += length
+ indels += 1
+ elif op == 2:
+ deletions += length
+ indels += 1
+
+ return edits - inserts - deletions + indels
+
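+# Worked example (values illustrative): for a read with CIGAR 20M2I20M and
+# NM:i:5, NM counts the two inserted bases plus three substitutions, so
+# read_calc_mismatches returns 5 - 2 (inserted bases) - 0 (deleted bases)
+# + 1 (one indel event) = 4: three mismatches plus the indel counted once.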
+
+def _extract_md_matches(md, maxlength):
+ md_pos = 0
+
+ while md and md_pos < maxlength:
+        tmp = '0'  # preload a zero so that immediate mismatches will be caught
+                   # the zero will have no effect otherwise...
+
+ # look for matches
+ while md and md[0] in '0123456789':
+ tmp += md[0]
+ md = md[1:]
+
+ pos = int(tmp)
+ if pos > maxlength:
+ return (maxlength, '%s%s' % (pos - maxlength, md))
+ return (pos, md)
+
+
+def read_calc_variations(read):
+ 'see _read_calc_variations'
+ for tup in _read_calc_variations(read.pos, read.cigar, read.opt('MD'), read.seq):
+ yield tup
+
+
+def _read_calc_variations(start_pos, cigar, md, seq):
+ '''
+ For each variation, outputs a tuple: (op, pos, seq)
+
+ op - operation (0 = mismatch, 1 = insert, 2 = deletion) (like CIGAR)
+ pos - 0-based position of the variation (relative to reference)
+ seq - the base (or bases) involved in the variation
+ for mismatch or insert, this is the sequence inserted
+ for deletions, this is the reference sequence that was removed
+
+ MD is the mismatch string. Not all aligners include the tag. If your aligner
+ doesn't include this, then you'll need to add it, or use a different function
+ (see: read_calc_mismatches_gen).
+
+ Special care must be used to handle RNAseq reads that cross
+ an exon-exon junction.
+
+ Also: MD is a *really* dumb format that can't be read correctly with
+ a regex. It must be processed in concert with the CIGAR alignment
+ in order to catch all edge cases. Some implementations insert 0's
+    at the end of inserts / deletions / variations to make parsing easier
+ but not everyone follows this. Look at the complex examples: the
+ CIGAR alignment may show an insert, but the MD just shows all matches.
+
+ Examples: See: http://davetang.org/muse/2011/01/28/perl-and-sam/
+ Also from CCBB actual mappings and manual altered (shortened,
+ made more complex)
+ (doctests included)
+
+ Match/mismatch
+ CIGAR: 36M
+ MD:Z: 1A0C0C0C1T0C0T27
+ MD:Z: 1ACCC1TCT27 (alternative)
+ 1 2
+ 123456789012345678901234567890123456
+ ref: CGATACGGGGACATCCGGCCTGCTCCTTCTCACATG
+ XXXX XXX
+ read: CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG
+ MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+ -ACCC-TCT---------------------------
+ >>> list(_read_calc_variations(1, [(0,36)], '1A0C0C0C1T0C0T27', 'CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG'))
+ [(0, 2, 'A'), (0, 3, 'C'), (0, 4, 'C'), (0, 5, 'C'), (0, 7, 'T'), (0, 8, 'C'), (0, 9, 'T')]
+
+ Insert
+ CIGAR: 6M1I29M
+ MD:Z: 0C1C0C1C0T0C27
+ C1CC1CTC27 (alt)
+ 1 2
+ 123456^789012345678901234567890123456
+ ref: CACCCC^TCTGACATCCGGCCTGCTCCTTCTCACAT
+ X XX X|XX
+ read: GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT
+ MMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+ G-GA-GGGG---------------------------
+ >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], '0C1C0C1C0T0C27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT'))
+ [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')]
+ >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], 'C1CC1CTC27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT'))
+ [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')]
+
+
+ Deletion
+ CIGAR: 9M9D27M
+ MD:Z: 2G0A5^ATGATGTCA27
+ 2GA5^ATGATGTCA27 (alt)
+ ref: AGGAATGGGATGATGTCAGGGGTTCCAGGTGGAGACGAGGACTCC
+ XX ^^^^^^^^^
+ read: AGTGATGGG^^^^^^^^^GGGGTTCCAGGTGGAGACGAGGACTCC
+ MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM
+ --TG-----ATGATGTCA---------------------------
+ >>> list(_read_calc_variations(1, [(0,9), (2,9), (0, 27)], '2G0A5^ATGATGTCA27', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC'))
+ [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA')]
+
+
+ Complex
+ CIGAR: 9M9D11M1I15M
+ MD:Z: 2G0A5^ATGATGTCAA26
+ MD:Z: 2G0A5^ATGATGTCA0G26 (alt)
+ 1 2 3 4
+ pos: 123456789012345678901234567890123456789012345
+ ref: AGGAATGGGATGATGTCAGGGGTTCCAGG^GGAGACGAGGACTCC
+ XX ^^^^^^^^^X |
+ read: AGTGATGGG^^^^^^^^^AGGGTTCCAGGTGGAGACGAGGACTCC
+ MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM
+ --TG-----ATGATGTCAG----------T---------------
+ >>> list(_read_calc_variations(1, [(0,9), (2,9), (0,11), (1,1), (0,15)], '2G0A5^ATGATGTCAA26', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC'))
+ [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA'), (0, 19, 'G'), (1, 30, 'T')]
+
+
+ Complex example - inserts aren't separately handled by MD, only visible in CIGAR
+ CIGAR: 14M2D16M3I42M
+ MD:Z: 14^TC58
+ 1 2 3 4 5 6 7
+ pos: 12345678901234567890123456789012^^^345678901234567890123456789012345678901234567
+ ref: caagtatcaccatgtcaggcatttttttcatt^^^tttgtagagagagaagacttgctatgttgcccaagctggcct
+ ^^ |||
+ read: CAAGTATCACCATG^^AGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT
+ MMMMMMMMMMMMMMDDMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+ --------------tc----------------TGG------------------------------------------
+ >>> list(_read_calc_variations(1, [(0,14), (2,2), (0,16), (1,3), (0,42)], '14^TC58', 'CAAGTATCACCATGAGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT'))
+ [(2, 15, 'TC'), (1, 33, 'TGG')]
+
+
+ Complex example 2:
+ CIGAR: 41M3I10M1I5M1I2M2I10M
+ MD:Z: 44C2C6T6T6
+ 1 2 3 4 5 6
+ pos: 12345678901234567890123456789012345678901^^^2345678901^23456^78^^9012345678
+ ref: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
+ ||| X X | X | || X
+ read: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
+ MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
+ -----------------------------------------tta---A--T---c---A----gt---G------
+
+
+ 13M 28M 3I 10M 1I 5M 1I 2M 2I 10M
+ >>> list(_read_calc_variations(1, [(0, 41), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
+ [(1, 42, 'TTA'), (0, 45, 'A'), (0, 48, 'T'), (1, 52, 'C'), (0, 55, 'A'), (1, 57, 'G'), (1, 59, 'GT'), (0, 62, 'G')]
+
+
+ Splice junction example:
+ CIGAR: 62M100N13M
+ MD:Z: 2T27C44
+ 1 1
+ 1 2 3 4 5 6 6 7
+ pos: 12345678901234567890123456789012345678901234567890123456789012| [100] |3456789012345
+ ref: CCTCATGACCAGCTTGTTGAAGAGATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC
+ X X
+ read: CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC
+ --G---------------------------T-----------------------------------------------------
+
+ >>> list(_read_calc_variations(1, [(0,62), (4,100), (0,13)], '2T27C44', 'CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC'))
+ [(0, 3, 'G'), (0, 31, 'T')]
+
+
+ Splice junction example 2:
+ CIGAR: 13M100N28M3I10M1I5M1I2M2I10M
+ MD:Z: 44C2C6T6T6
+ 1 1 1 1 1
+ 1 2 3 4 5 6
+ pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678
+ ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
+ ||| X X | X | || X
+ read: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
+ MMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
+ ------------- ----------------------------tta---A--T---c---A----gt---G------
+
+ 13M 100N 28M 3I 10M 1I 5M 1I 2M 2I 10M
+ >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 28), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
+ [(1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')]
+
+
+ Splice junction example 2A:
+ CIGAR: 13M100N7M2D19M3I10M1I5M1I2M2I10M
+ MD:Z: 9A10^GG22C2C6T6T6
+ 1 1 1 1 1
+ 1 2 3 4 5 6
+ pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678
+ ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
+ ^^ ||| X X | X | || X
+ read: AGGGTGGCGCGAT|-------|CGATGAC^^CATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
+ MMMMMMMMMMMMM MMMMMMMDDMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
+ ---------C--- ----------------------------tta---A--T---c---A----gt---G------
+ .........A... .......GG................... ...C..C... ...T. .. ...T......
+ 9 A 10 ^GG 22 C 2C 6 T 6 T 6
+
+ >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 7), (2, 2), (0, 19), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '9A10^GG22C2C6T6T6', 'AGGGTGGCGCGATCGATGACCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
+ [(0, 10, 'C'), (2, 121, 'GG'), (1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')]
+
+ Real Example
+ 242_1071_1799_B1
+ CIGAR: 42M10I3M1D9M1D11M
+ MD:Z: 27G16A0^T6C2^T1C9
+ 1 2 3 4 5 6 7
+ pos: 123456789012345678901234567890123456789012 345678901234567890123456789012345
+ ref: ACTGAGAAACCCAACCCTCTGAGACCAGCACACCCCTTTCAA^^^^^^^^^^GCATGTTCCTCCCTCCCCTTCTTTG
+ X X^ X ^ X
+ read: ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCC^GTTCCTGCC^CGCCTTCTTTG
+ MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIMMMDMMMMMMMMMDMMMMMMMMMMM
+ ---------------------------A--------------^^^^^^^^^^--CT------G--T-G---------
+
+ >>> list(_read_calc_variations(1, [(0,42), (1,10), (0, 3), (2, 1), (0, 9), (2, 1), (0, 11)], '27G16A0^T6C2^T1C9', 'ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCCGTTCCTGCCCGCCTTCTTTG', ))
+ [(0, 28, 'A'), (1, 43, 'CACATTTTTG'), (0, 45, 'C'), (2, 46, 'T'), (0, 53, 'G'), (2, 56, 'T'), (0, 58, 'G')]
+
+
+ Real example 2
+ 577_1692_891_A1
+ CIGAR: 34M100N39M2I
+ MD:Z: 3T69
+ 1 1 1 1
+ 1 2 3 4 5 6 7
+ pos: 1234567890123456789012345678901234| [100] |567890123456789012345678901234567890123
+ ref: GGATTCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTC^^
+
+ read: GGAATCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC
+ MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM NNNNN MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMII
+ ---A------------------------------ ---------------------------------------TC
+
+ >>> list(_read_calc_variations(1, [(0,34), (3,100), (0, 39), (1, 2)], '3T69', 'GGAATCTTCCCACTGGGTCGATGTTGTTTGTGATCTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC', ))
+ [(0, 4, 'A'), (1, 174, 'TC')]
+
+ '''
+
+ ref_pos = start_pos
+ read_pos = 0
+
+ for op, length in cigar:
+ if md and md[0] == '0':
+ md = md[1:]
+ # sys.stderr.write('%s, %s, %s\n' %(op, length, md))
+ if op == 0: # M
+ # how far in the chunk are we? (do *not* update ref_pos until end)
+ md_pos = 0
+ last = None
+ while md and md_pos < length:
+ if last == (op, length, md):
+ sys.stderr.write('\nInfinite loop in variant finding!\nPos: %s\nCIGAR: (%s, %s)\n' % (ref_pos, op, length))
+ sys.exit(1)
+ last = (op, length, md)
+ # sys.stderr.write('%s, %s, %s\n' %(op, length, md))
+ chunk_size, md = _extract_md_matches(md, length - md_pos)
+ # sys.stderr.write(' -> %s, %s\n' %(chunk_size, md))
+ md_pos += chunk_size
+
+ # look for mismatches
+ while md_pos < length and md and md[0] not in '0123456789^':
+ yield (op, ref_pos + md_pos, seq[read_pos + md_pos])
+ md = md[1:]
+
+ md_pos += 1
+
+ ref_pos += length
+ read_pos += length
+
+ elif op == 1: # I
+ # nothing in MD about inserts...
+ yield (op, ref_pos, seq[read_pos:read_pos + length])
+ read_pos += length
+
+ elif op == 2: # D
+ # prefixed with '^' and includes all of the removed bases
+ if md[0] == '^':
+ md = md[1:]
+ yield (op, ref_pos, md[:length])
+ md = md[length:]
+ ref_pos += length
+
+ elif op == 3: # N
+ ref_pos += length
+
+
+def read_calc_mismatches_gen(ref, read, chrom):
+ start = read.pos
+ ref_pos = 0
+ read_pos = 0
+
+ for op, length in read.cigar:
+ if op == 1:
+ yield ref_pos, op, None
+ read_pos += length
+ elif op == 2:
+ yield ref_pos, op, None
+ ref_pos += length
+ elif op == 3:
+ ref_pos += length
+ elif op == 0:
+ refseq = ref.fetch(chrom, start + ref_pos, start + ref_pos + length)
+ if not refseq:
+ raise ValueError("Reference '%s' not found in FASTA file: %s" % (chrom, ref.filename))
+ cur_pos = start + ref_pos
+ for refbase, readbase in zip(refseq.upper(), read.seq[read_pos:read_pos + length].upper()):
+ if refbase != readbase:
+ yield op, cur_pos, readbase
+ cur_pos += 1
+ ref_pos += length
+ read_pos += length
+ else:
+ raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+
+def read_calc_mismatches_ref(ref, read, chrom):
+ edits = 0
+
+ for op, pos, base in read_calc_mismatches_gen(ref, read, chrom):
+ edits += 1
+
+ return edits
+
+
+__region_cache = {}
+
+
+def region_pos_to_genomic_pos(name, start, cigar):
+ '''
+ converts a junction position to a genomic location given a junction
+ ref name, the junction position, and the cigar alignment.
+
+ returns: (genomic ref, genomic pos, genomic cigar)
+
+ >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050,3000-4000', 25, [(0, 100)])
+ ('chr1', 1025, [(0, 25), (3, 950), (0, 50), (3, 950), (0, 25)])
+
+ >>> region_pos_to_genomic_pos('chr1:1000-1050,1050-1200', 25, [(0, 100)])
+ ('chr1', 1025, [(0, 25), (3, 0), (0, 75)])
+
+ >>> region_pos_to_genomic_pos('chr3R:17630851-17630897,17634338-17634384', 17, [(0, 39)])
+ ('chr3R', 17630868, [(0, 29), (3, 3441), (0, 10)])
+
+ >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(4, 25), (0, 50), (4, 25)])
+ ('chr1', 1025, [(4, 25), (0, 25), (3, 950), (0, 25), (4, 25)])
+
+ >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)])
+ ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)])
+
+ >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)])
+ ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)])
+
+ >>> region_pos_to_genomic_pos('chr7:16829153-16829246,16829246-16829339', 62, cigar_fromstr('83M18S'))
+ ('chr7', 16829215, [(0, 31), (3, 0), (0, 52), (4, 18)])
+
+
+ '''
+
+ if name in __region_cache:
+ chrom, fragments = __region_cache[name]
+ else:
+ c1 = name.split(':')
+ chrom = c1[0]
+
+ fragments = []
+ for fragment in c1[1].split(','):
+ s, e = fragment.split('-')
+ fragments.append((int(s), int(e)))
+
+ __region_cache[name] = (chrom, fragments)
+
+ chr_cigar = []
+ chr_start = fragments[0][0]
+
+ read_start = int(start)
+
+ frag_idx = 0
+ frag_start = 0
+ frag_end = 0
+
+ for i, (s, e) in enumerate(fragments):
+ if chr_start + read_start < e:
+ chr_start += read_start
+ frag_idx = i
+ frag_start = s
+ frag_end = e
+ break
+
+ else:
+ chr_start += (e - s)
+ read_start -= (e - s)
+
+ cur_pos = chr_start
+
+ for op, length in cigar:
+ if op in [1, 4, 5]:
+ chr_cigar.append((op, length))
+
+ elif op in [0, 2]:
+ if cur_pos + length <= frag_end:
+ cur_pos += length
+ chr_cigar.append((op, length))
+
+ else:
+ while cur_pos + length > frag_end:
+ if frag_end - cur_pos > 0:
+ chr_cigar.append((op, frag_end - cur_pos))
+ length -= (frag_end - cur_pos)
+ cur_pos = frag_end
+ frag_idx += 1
+ if len(fragments) <= frag_idx:
+ print 'ERROR converting: ', name, fragments
+ return (chrom, 0, chr_cigar)
+ frag_start, frag_end = fragments[frag_idx]
+ chr_cigar.append((3, frag_start - cur_pos))
+ cur_pos = frag_start
+
+ cur_pos = cur_pos + length
+ chr_cigar.append((op, length))
+ else:
+ print "Unsupported CIGAR operation (%s)" % bam_cigar[op]
+ sys.exit(1)
+
+ return (chrom, chr_start, chr_cigar)
+
+
+def is_junction_valid(cigar, min_overlap=4):
+ '''
+ Does the genomic cigar alignment represent a 'good' alignment.
+ Used for checking junction->genome alignments
+
+ 1) the alignment must not start at a splice junction
+ 2) the alignment must not start or end with an overhang
+ 3) the alignment must overhang the splice junction by min_overlap (4)
+
+ | Exon1 | Intron | Exon2 |
+ |-----------------|oooooooooooooooo|------------------|
+ XXXXXXXXXXXXXXXXXXXXXXXX (bad 1)
+ XXXXXXXXXXXXX (bad 2) XXXXXXXXXXXXXXXX (bad 2)
+ XX-----------------XXXXXXXXXXXXXXXXX (bad 3)
+
+ >>> is_junction_valid(cigar_fromstr('1000N40M'))
+ (False, 'Starts at gap (1000N40M)')
+
+ >>> is_junction_valid(cigar_fromstr('100M'))
+ (False, "Doesn't cover junction")
+
+ >>> is_junction_valid(cigar_fromstr('100M1000N3M'), 4)
+ (False, "Too short overlap at 3' (100M1000N3M)")
+
+ >>> is_junction_valid(cigar_fromstr('2M1000N100M'), 4)
+ (False, "Too short overlap at 5' (2M1000N100M)")
+
+ >>> is_junction_valid(cigar_fromstr('4M1000N100M'), 4)
+ (True, '')
+
+ >>> is_junction_valid(cigar_fromstr('100M1000N4M'), 4)
+ (True, '')
+
+ >>> is_junction_valid(cigar_fromstr('4M0N100M'), 4)
+ (True, '')
+
+ '''
+ first = True
+ pre_gap = True
+
+ pre_gap_count = 0
+ post_gap_count = 0
+
+ has_gap = False
+
+ for op, length in cigar:
+ # mapping can't start at a gap
+ if first and op == 3:
+ return (False, 'Starts at gap (%s)' % cigar_tostr(cigar))
+ first = False
+
+ if op == 3:
+ pre_gap = False
+ post_gap_count = 0
+ has_gap = True
+
+ elif pre_gap:
+ pre_gap_count += length
+ else:
+ post_gap_count += length
+
+    # the mapping must overlap the junction by at least min_overlap bases on each side
+
+ if not has_gap:
+ return (False, "Doesn't cover junction")
+ elif pre_gap_count < min_overlap:
+ return (False, "Too short overlap at 5' (%s)" % cigar_tostr(cigar))
+ elif post_gap_count < min_overlap:
+ return (False, "Too short overlap at 3' (%s)" % cigar_tostr(cigar))
+
+ return True, ''
+
+
+def read_alignment_fragments_gen(read):
+ '''
+ Takes a read and returns the start and end positions for each match/mismatch region.
+ This will let us know where each read alignment "touches" the genome.
+ '''
+
+ for start, end in _read_alignment_fragments_gen(read.pos, read.cigar):
+ yield (start, end)
+
+
+def _read_alignment_fragments_gen(pos, cigar):
+ ''' Test-able version of read_alignment_fragments_gen
+ >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('50M')))
+ [(1, 51)]
+
+ >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('25M100N25M')))
+ [(1, 26), (126, 151)]
+
+ >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('20M1D4M100N10M5I10M')))
+ [(1, 21), (22, 26), (126, 136), (136, 146)]
+ '''
+ ref_pos = pos
+
+ for op, length in cigar:
+ if op == 0:
+ yield (ref_pos, ref_pos + length)
+ ref_pos += length
+ elif op == 1:
+ pass
+ elif op == 2:
+ ref_pos += length
+ elif op == 3:
+ ref_pos += length
+ else:
+ raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+
+def read_cigar_at_pos(cigar, qpos, is_del):
+ '''
+ Returns the CIGAR operation for a given read position
+
+ qpos is the 0-based index of the base in the read
+ '''
+ pos = 0
+ returnnext = False
+ for op, length in cigar:
+ if returnnext:
+ return op
+ if op == 0:
+ pos += length
+ elif op == 1:
+ pos += length
+ elif op == 2:
+ pass
+ elif op == 3:
+ pass
+ elif op == 4:
+ pos += length
+ elif op == 5:
+ pass
+ else:
+ raise ValueError("Unsupported CIGAR operation: %s" % op)
+
+ if pos > qpos:
+ return op
+ if is_del and pos == qpos:
+ returnnext = True
+
+ return None
+
+
+def cleancigar(cigar):
+ '''
+ Cleans a CIGAR alignment to remove zero length operations
+
+ >>> cigar_tostr(cleancigar(cigar_fromstr('31M0N52M18S')))
+ '83M18S'
+
+ '''
+ newcigar = []
+
+ changed = False
+ zero = False
+
+    for op, size in cigar:
+        if size > 0:
+            if zero and newcigar and newcigar[-1][0] == op:
+                # merge with the previous operation of the same type
+                newcigar[-1] = (op, newcigar[-1][1] + size)
+            else:
+                newcigar.append((op, size))
+            zero = False
+        else:
+            changed = True
+            zero = True
+
+ if changed:
+ return newcigar
+
+ return None
+
+
+def read_cleancigar(read):
+ '''
+ Replaces the CIGAR string for a read to remove any operations that are zero length.
+
+ This happens to the read in-place
+ '''
+ if read.is_unmapped:
+ return False
+
+    newcigar = cleancigar(read.cigar)
+
+ if newcigar:
+ read.cigar = newcigar
+ return True
+
+ return False
+
+
+def read_to_unmapped(read, ref=None):
+ '''
+ Converts a read from mapped to unmapped.
+
+ Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is passed)
+ '''
+
+ newread = pysam.AlignedRead()
+
+    tags = []
+    if ref:
+        tags = [('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar)))]
+
+ newread.is_unmapped = True
+ newread.mapq = 0
+ newread.tlen = 0
+ newread.pos = -1
+ newread.pnext = -1
+ newread.rnext = -1
+ newread.tid = -1
+
+ newread.qname = read.qname
+
+ if read.is_paired:
+ newread.is_paired = True
+
+ if not read.is_unmapped and read.is_reverse:
+ newread.seq = ngsutils.support.revcomp(read.seq)
+ newread.qual = read.qual[::-1]
+ else:
+ newread.seq = read.seq
+ newread.qual = read.qual
+
+ newread.tags = tags
+
+ return newread
+
+
+
+if __name__ == '__main__':
+ import doctest
+ doctest.testmod()
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bam/__init__.pyc
Binary file ngsutils/bam/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bed/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/bed/__init__.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,282 @@
+import os
+import ngsutils.support.ngs_utils
+import pysam
+
+
+class BedStreamer(object):
+ '''
+ Streams BedRegions from a BED file
+
+ Note - this can only be used once! There is no mechanism to seek the stream.
+ '''
+
+ def __init__(self, fname=None, fileobj=None, quiet=False):
+ if not fname and not fileobj:
+ raise ValueError("You must specify either fname or fileobj!")
+
+ self.reader = ngsutils.support.gzip_reader(fname=fname, quiet=quiet, fileobj=fileobj)
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ try:
+ while True:
+ line = self.reader.next().strip()
+ if line and line[0] != '#':
+ cols = line.split('\t')
+ while len(cols) < 6:
+ cols.append('')
+
+ return BedRegion(*cols)
+ except:
+ raise StopIteration
+
+
+
+class BedFile(object):
+ '''
+    BED files are read into memory in their entirety, in a series of bins.
+    Each bin is ~100kb in size. Each bin can then be iterated over.
+
+ This is less efficient than using a proper index, but in reality, this
+ usually isn't an issue. However, if the BED file has been Tabix indexed,
+ that index will be used for random access.
+
+ NOTE: This isn't very efficient, so perhaps this can be remodeled into a BedFile
+ and a BedFileIndex where the file is indexed only if random access is requested.
+ '''
+
+ _bin_const = 100000
+
+ def __init__(self, fname=None, fileobj=None, region=None):
+ self._bins = {}
+ self._bin_list = []
+ self._cur_bin_idx = 0
+ self._cur_bin_pos = 0
+ self._tellpos = 0
+ self._total = 0
+ self._length = 0
+ self.__tabix = None
+
+ self.filename = fname
+
+ if os.path.exists('%s.tbi' % fname):
+ self.__tabix = pysam.Tabixfile(fname)
+
+ if fileobj:
+ self.__readfile(fileobj)
+ elif fname:
+ with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj:
+ self.__readfile(fobj)
+ elif region:
+ chrom, startend = region.split(':')
+ if '-' in startend:
+ start, end = [int(x) for x in startend.split('-')]
+ else:
+ start = int(startend)
+ end = start
+ start -= 1
+
+ self.__add_region(BedRegion(chrom, start, end))
+ else:
+ raise ValueError("Must specify either filename, fileobj, or region")
+
+ def __readfile(self, fobj):
+ for line in fobj:
+ line = line.strip()
+ if line and line[0] != '#':
+ cols = line.split('\t')
+ while len(cols) < 6:
+ cols.append('')
+
+ region = BedRegion(*cols)
+ self.__add_region(region)
+
+ self._bin_list.sort()
+ for bin in self._bins:
+ self._bins[bin].sort()
+
+ def __add_region(self, region):
+ self._total += region.end - region.start
+ self._length += 1
+
+ startbin = region.start / BedFile._bin_const
+ endbin = region.end / BedFile._bin_const
+
+ for bin in xrange(startbin, endbin + 1):
+ if not (region.chrom, bin) in self._bins:
+ self._bin_list.append((region.chrom, bin))
+ self._bins[(region.chrom, bin)] = []
+ self._bins[(region.chrom, bin)].append(region)
+
+ def fetch(self, chrom, start, end, strand=None):
+ '''
+ For TABIX indexed BED files, find all regions w/in a range
+
+ For non-TABIX index BED files, use the calculated bins, and
+ output matching regions
+ '''
+
+ if self.__tabix:
+ for match in self.__tabix.fetch(chrom, start, end):
+ region = BedRegion(*match.split('\t'))
+ if not strand or (strand and region.strand == strand):
+ yield region
+ else:
+ startbin = start / BedFile._bin_const
+ endbin = end / BedFile._bin_const
+
+ buf = set()
+
+ for bin in xrange(startbin, endbin + 1):
+ if (chrom, bin) in self._bins:
+ for region in self._bins[(chrom, bin)]:
+ if strand and strand != region.strand:
+ continue
+                        if start <= region.start <= end or start <= region.end <= end:
+                            if region not in buf:
+                                yield region
+                                buf.add(region)
+                        elif region.start < start and region.end > end:
+                            if region not in buf:
+                                yield region
+                                buf.add(region)
+
+ def tell(self):
+ return self._tellpos
+
+ def close(self):
+ pass
+
+ @property
+ def length(self):
+ return self._length
+
+ @property
+ def total(self):
+ return self._total
+
+ def __iter__(self):
+ self._cur_bin_idx = 0
+ self._cur_bin_pos = 0
+ self._tellpos = 0
+ return self
+
+ def next(self):
+ if self._cur_bin_idx >= len(self._bin_list):
+ raise StopIteration
+
+ binvals = self._bins[self._bin_list[self._cur_bin_idx]]
+ while self._cur_bin_pos < len(binvals):
+ val = binvals[self._cur_bin_pos]
+ self._cur_bin_pos += 1
+
+ startbin = (val.chrom, val.start / BedFile._bin_const)
+ if startbin == self._bin_list[self._cur_bin_idx]:
+ self._tellpos += 1
+ return val
+
+ self._cur_bin_idx += 1
+ self._cur_bin_pos = 0
+ return self.next()
+
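+# A short usage sketch (file name and coordinates hypothetical); fetch()
+# behaves the same whether it goes through a Tabix index or the in-memory bins:
+#
+#   bed = BedFile('regions.bed')
+#   for region in bed.fetch('chr1', 10000, 20000, strand='+'):
+#       print region.chrom, region.start, region.end, region.name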
+
+class BedRegion(object):
+ def __init__(self, chrom, start, end, name='', score='', strand='', thickStart='', thickEnd='', rgb='', *args):
+ self.chrom = chrom
+ self.start = int(start)
+ self.end = int(end)
+ self.name = name
+
+ if score == '':
+ self.score = 0
+ else:
+ self.score = float(score)
+
+ if strand == '':
+ self.strand = None
+ else:
+ self.strand = strand
+
+ if thickStart == '':
+ self.thickStart = None
+ else:
+ self.thickStart = thickStart
+
+ if thickEnd == '':
+ self.thickEnd = None
+ else:
+ self.thickEnd = thickEnd
+
+ if rgb == '':
+ self.rgb = None
+ else:
+ self.rgb = rgb
+
+ self.extras = args
+
+ def clone(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, rgb=None, *args):
+ cols = []
+ cols.append(self.chrom if chrom is None else chrom)
+ cols.append(self.start if start is None else start)
+ cols.append(self.end if end is None else end)
+ cols.append(self.name if name is None else name)
+ cols.append(self.score if score is None else score)
+ cols.append(self.strand if strand is None else strand)
+ cols.append(self.thickStart if thickStart is None else thickStart)
+ cols.append(self.thickEnd if thickEnd is None else thickEnd)
+ cols.append(self.rgb if rgb is None else rgb)
+
+ for i, val in enumerate(self.extras):
+ if len(args) > i:
+ cols.append(args[i])
+ else:
+ cols.append(val)
+
+ return BedRegion(*cols)
+
+ @property
+ def score_int(self):
+ score = str(self.score)
+ if score[-2:] == '.0':
+ score = score[:-2]
+
+ return score
+
+ def __key(self):
+ return (self.chrom, self.start, self.end, self.strand, self.name)
+
+ def __lt__(self, other):
+ return self.__key() < other.__key()
+
+ def __gt__(self, other):
+ return self.__key() > other.__key()
+
+ def __eq__(self, other):
+ return self.__key() == other.__key()
+
+ def write(self, out):
+ out.write('%s\n' % self)
+
+ def __repr__(self):
+ outcols = []
+
+ if self.rgb:
+ outcols.append(self.rgb)
+ if self.thickEnd or outcols:
+ outcols.append(self.thickEnd if self.thickEnd else self.end)
+ if self.thickStart or outcols:
+ outcols.append(self.thickStart if self.thickStart else self.start)
+ if self.strand or outcols:
+ outcols.append(self.strand)
+ if self.score_int != '' or outcols:
+ outcols.append(self.score_int)
+ if self.name or outcols:
+ outcols.append(self.name)
+
+ outcols.append(self.end)
+ outcols.append(self.start)
+ outcols.append(self.chrom)
+
+        return '\t'.join([str(x) for x in outcols[::-1]])
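+
+
+# Example (sketch): optional BED columns are emitted only when present.
+#     r = BedRegion('chr1', 100, 200, 'feat1', 0, '+')
+#     print r    # -> chr1 <tab> 100 <tab> 200 <tab> feat1 <tab> 0 <tab> +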
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bed/__init__.pyc
Binary file ngsutils/bed/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/__init__.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,249 @@
+import collections
+import gzip
+import os
+import sys
+import re
+try:
+ from eta import ETA
+except ImportError:
+    ETA = None  # progress meter is optional; fall back gracefully if absent
+
+class FASTARead(collections.namedtuple('FASTARecord', 'name comment seq')):
+ def __repr__(self):
+ if self.comment:
+ return '>%s %s\n%s\n' % (self.name, self.comment, self.seq)
+ return '>%s\n%s\n' % (self.name, self.seq)
+
+    def subseq(self, start, end, comment=None):
+        if self.comment and comment:
+            comment = '%s %s' % (self.comment, comment)
+        elif self.comment:
+            comment = self.comment
+
+        return FASTARead(self.name, comment, self.seq[start:end])
+
+ def clone(self, name=None, comment=None, seq=None):
+ n = name if name else self.name
+ c = comment if comment else self.comment
+ s = seq if seq else self.seq
+
+ return FASTARead(n, c, s)
+
+ def write(self, out):
+ out.write(repr(self))
+
+
+class FASTA(object):
+    def __init__(self, fname=None, fileobj=None, qual=False):
+        self.fname = fname
+        self.qual = qual
+        if fileobj:
+            self.fileobj = fileobj
+        elif fname == '-':
+            self.fileobj = sys.stdin
+        elif fname:
+            if fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+                self.fileobj = gzip.open(os.path.expanduser(fname))
+            else:
+                self.fileobj = open(os.path.expanduser(fname))
+        else:
+            raise ValueError("Missing valid filename or fileobj")
+
+ def close(self):
+ if self.fileobj != sys.stdout:
+ self.fileobj.close()
+
+ def tell(self):
+ # always relative to uncompressed...
+ return self.fileobj.tell()
+
+ def seek(self, pos, whence=0):
+ self.fileobj.seek(pos, whence)
+
+ def fetch(self, quiet=False):
+ name = ''
+ comment = ''
+ seq = ''
+
+        if ETA and not quiet and self.fname and self.fname != '-':
+            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
+        else:
+            eta = None
+
+ for line in self.fileobj:
+ line = line.strip()
+ if not line:
+ continue
+ if line[0] == '#':
+ continue
+
+ if line[0] == '>':
+ if name and seq:
+ if eta:
+ eta.print_status(extra=name)
+ yield FASTARead(name, comment, seq)
+
+ spl = re.split(r'[ \t]', line[1:], maxsplit=1)
+ name = spl[0]
+ if len(spl) > 1:
+ comment = spl[1]
+ else:
+ comment = ''
+ seq = ''
+
+ else:
+ if self.qual:
+ seq = seq + ' ' + line
+ else:
+ seq += line
+
+ if name and seq:
+ if eta:
+ eta.print_status(extra=name)
+ yield FASTARead(name, comment, seq)
+
+ if eta:
+ eta.done()
+
+
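+# Example use of FASTA.fetch() (sketch): stream records from a plain or
+# gzipped FASTA file; 'ref.fa' is a placeholder path.
+#     for read in FASTA('ref.fa').fetch(quiet=True):
+#         print read.name, len(read.seq)
+
+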
+def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None):
+ if fileobj:
+ f = fileobj
+ elif fname == '-':
+ f = sys.stdin
+ elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+ f = gzip.open(os.path.expanduser(fname))
+ else:
+ f = open(os.path.expanduser(fname))
+
+    if quiet or not ETA or not fname or fname == '-':
+        eta = None
+    else:
+        eta = ETA(os.stat(fname).st_size, fileobj=f)
+
+ for line in f:
+ if eta:
+ if callback:
+ extra = callback()
+ else:
+ extra = ''
+
+ eta.print_status(extra=extra)
+ yield line
+
+ if done_callback and done_callback():
+ break
+
+ if f != sys.stdin:
+ f.close()
+
+ if eta:
+ eta.done()
+
+
+class Symbolize(object):
+ 'Converts strings to symbols - basically a cache of strings'
+ def __init__(self):
+ self.__cache = {}
+
+ def __getitem__(self, k):
+        if k not in self.__cache:
+ self.__cache[k] = k
+
+ return self.__cache[k]
+
+symbols = Symbolize()
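+
+# Example (sketch): repeated strings share one cached object, which saves
+# memory when the same chromosome names recur millions of times.
+#     a = symbols['chr1']
+#     b = symbols['chr1']
+#     a is b   # -> True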
+
+_complements = {
+    'a': 't',
+    'A': 'T',
+    'c': 'g',
+    'C': 'G',
+    'g': 'c',
+    'G': 'C',
+    't': 'a',
+    'T': 'A',
+    'n': 'n',
+    'N': 'N',
+}
+
+
+def revcomp(seq):
+ '''
+ >>> revcomp('ATCGatcg')
+ 'cgatCGAT'
+ '''
+ ret = []
+
+ for s in seq:
+        ret.append(_complements[s])
+
+ ret.reverse()
+ return ''.join(ret)
+
+
+class Counts(object):
+ '''
+ Setup simple binning. Bins are continuous 0->max. Values are added to
+ bins and then means / distributions can be calculated.
+ '''
+ def __init__(self):
+ self.bins = []
+
+ def add(self, val):
+ while len(self.bins) <= val:
+ self.bins.append(0)
+ self.bins[val] += 1
+
+ def mean(self):
+ acc = 0
+ count = 0
+
+ for i, val in enumerate(self.bins):
+ acc += (i * val)
+ count += val
+
+ if count > 0:
+ return float(acc) / count
+
+ def max(self):
+ return len(self.bins) - 1
+
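+# Example use of Counts (sketch): bin small integer values, then summarize.
+#     c = Counts()
+#     for v in [1, 2, 2, 5]:
+#         c.add(v)
+#     c.mean()   # -> 2.5
+#     c.max()    # -> 5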
+
+def memoize(func):
+ if 'TESTING' in os.environ or 'DEBUG' in os.environ:
+ return func
+
+ __cache = {}
+ def inner(*args, **kwargs):
+ k = (args, tuple(kwargs.iteritems()))
+ if k not in __cache:
+ __cache[k] = func(*args, **kwargs)
+ return __cache[k]
+
+ inner.__doc__ = '(@memoized %s)\n%s' % (func.__name__, func.__doc__)
+ return inner
+
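+# Example (sketch): cache a pure function; repeat calls with the same
+# arguments are served from the cache (disabled when TESTING/DEBUG is set).
+#     @memoize
+#     def slow_square(x):
+#         return x * x
+#     slow_square(12)   # computed once
+#     slow_square(12)   # cached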
+
+def quoted_split(s, delim, quote_char='"'):
+ tokens = []
+
+ buf = ""
+ inquote = False
+
+ for c in s:
+ if inquote:
+ buf += c
+ if c == quote_char:
+ inquote = False
+ elif c == delim:
+ tokens.append(buf)
+ buf = ""
+ else:
+ buf += c
+ if c == quote_char:
+ inquote = True
+
+ if buf:
+ tokens.append(buf)
+
+ return tokens
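+
+
+# Example (sketch): delimiters inside quotes do not split the field and the
+# quote characters are kept.
+#     quoted_split('a,"b,c",d', ',')   # -> ['a', '"b,c"', 'd']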
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/__init__.pyc
Binary file ngsutils/support/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/bgzip.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/bgzip.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+'''
+Extract the blocks from a BGZip file.
+
+BAM files are stored as blocks in a bgzip archive. This class
+will load the bgzip archive and output the block information.
+'''
+
+import sys
+import os
+import struct
+
+
+class BGZip(object):
+ def __init__(self, fname):
+ self.fname = fname
+ self.pos = 0
+        self.fileobj = open(self.fname, 'rb')  # BGZF archives are binary
+ self.fsize = os.stat(self.fname).st_size
+ self.cur_chunk = 0
+
+        self.cpos = 0
+        self.cdata = ''
+
+ def close(self):
+ self.fileobj.close()
+
+ def seek(self, offset):
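+        # a BAM/BGZF "virtual offset" packs two numbers together: the upper
+        # 48 bits hold the file offset of a compressed block and the low
+        # 16 bits the position inside the uncompressed block, i.e.
+        #     voffset = (block_file_offset << 16) | within_block_offset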
+ bgz_offset = offset >> 16
+ chunk_offset = offset & 0xFFFF
+
+ self.fileobj.seek(bgz_offset)
+ self.read_chunk()
+        self.cpos = chunk_offset
+
+    def read(self, amount, whence=0):
+        if whence not in (0, 1):
+            print "Bad Whence value!: %s" % whence
+            return
+
+        if whence == 0:
+            self.seek(0)
+
+        # take what we can from the current chunk; if it doesn't hold enough
+        # data, consume the rest of it and decompress the next chunk
+        ret = ''
+        while amount and self.pos < self.fsize:
+            available = len(self.cdata) - self.cpos
+            if available >= amount:
+                ret += self.cdata[self.cpos:self.cpos + amount]
+                self.cpos += amount
+                return ret
+            else:
+                ret += self.cdata[self.cpos:]
+                amount -= available
+                self.read_chunk()
+        return ret
+
+ def read_chunk(self):
+ self.fileobj.seek(10)
+        # gzip member header: ID1, ID2, CM, FLG, MTIME, XFL, OS, XLEN
+        # ('<BBBBIBBH' per RFC 1952); the remainder of read_chunk and the
+        # start of the ngsutils/support/dbsnp.py diff below were truncated
+        # in this patch
+        id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/dbsnp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/dbsnp.py Wed Nov 11 13:04:07 2015 -0500
+    def dump(self, chrom, op, pos, base, snp, exit=True):
+        # debugging helper (signature reconstructed from the calls below)
+        print op, chrom, pos, base
+        print snp
+        print snp.alleles
+        if exit:
+            sys.exit(1)
+
+ def is_valid_variation(self, chrom, op, pos, seq, verbose=False):
+ for snp in self.fetch(chrom, pos):
+            if '/' not in snp.observed or snp.clazz not in ['single', 'mixed', 'in-del', 'insertion', 'deletion']:
+ # these are odd variations that we can't deal with... (microsatellites, tooLongToDisplay members, etc)
+ continue
+
+ if op == 0:
+ if snp.clazz in ['single', 'mixed'] and seq in snp.alleles:
+ return True
+ # elif verbose:
+ # for alt in snp.alleles:
+ # if len(alt) > 1:
+ # self.dump(chrom, op, pos, seq, snp, False)
+
+ elif op == 1:
+ if snp.clazz in ['insertion', 'mixed', 'in-del']:
+ if seq in snp.alleles:
+ if len(seq) > 1 and verbose:
+ self.dump(chrom, op, pos, seq, snp, False)
+ return True
+
+ if verbose:
+ if len(seq) > 1:
+ self.dump(chrom, op, pos, seq, snp, False)
+ else:
+ for alt in snp.alleles:
+ if len(alt) > 1:
+ self.dump(chrom, op, pos, seq, snp, False)
+
+ elif op == 2:
+ if snp.clazz in ['deletion', 'mixed', 'in-del']:
+ if '-' in snp.alleles and seq in snp.alleles:
+ if len(seq) > 1 and verbose:
+ self.dump(chrom, op, pos, seq, snp, False)
+ return True
+
+ return False
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/dbsnp.pyc
Binary file ngsutils/support/dbsnp.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/llh.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/llh.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,55 @@
+'''
+Methods for calculating log-likelihoods for nucleotide frequencies
+'''
+import math
+import collections
+from ngsutils.support import memoize
+
+_default_background = {'A': 0.3, 'T': 0.3, 'C': 0.2, 'G': 0.2}
+
+NucleotideLogLikelihood = collections.namedtuple('NucleotideLogLikelihood', 'A C G T pseudo')
+
+@memoize
+def pseudo_count(N, bg):
+ '''
+ >>> pseudo_count(100, _default_background['A'])
+    3.0
+    >>> pseudo_count(100, _default_background['C'])
+    2.0
+ '''
+
+ return bg * math.sqrt(N)
+
+
+def calc_llh(A, C, G, T, bg=_default_background, pseudo='auto'):
+ if pseudo == 'auto':
+ N = A + C + G + T
+ Ap = float(A) + pseudo_count(N, bg['A'])
+ Cp = float(C) + pseudo_count(N, bg['C'])
+ Gp = float(G) + pseudo_count(N, bg['G'])
+ Tp = float(T) + pseudo_count(N, bg['T'])
+ elif pseudo:
+ Ap = float(A) + pseudo
+ Cp = float(C) + pseudo
+ Gp = float(G) + pseudo
+ Tp = float(T) + pseudo
+ else:
+ Ap = float(A)
+ Cp = float(C)
+ Gp = float(G)
+ Tp = float(T)
+
+ Np = Ap + Cp + Gp + Tp
+
+ freqA = Ap / Np
+ freqC = Cp / Np
+ freqG = Gp / Np
+ freqT = Tp / Np
+
+ return NucleotideLogLikelihood(math.log(freqA / bg['A']), math.log(freqC / bg['C']), math.log(freqG / bg['G']), math.log(freqT / bg['T']), pseudo)
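+
+
+# Example use of calc_llh (sketch): compare observed nucleotide counts with
+# the background model; a positive component means that base is enriched.
+#     llh = calc_llh(60, 10, 10, 20)
+#     llh.A > 0   # -> True ('A' observed at ~0.57 vs. a 0.3 background)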
+
+
+
+if __name__ == '__main__':
+ import doctest
+ doctest.testmod()
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/ngs_utils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/ngs_utils.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+"""
+
+Common util classes / functions for the NGS project
+
+"""
+import sys
+import os
+import gzip
+import re
+import collections
+
+
+def format_number(n):
+ '''
+ >>> format_number(1000)
+ '1,000'
+ >>> format_number(1234567)
+ '1,234,567'
+ '''
+ ar = list(str(n))
+ for i in range(len(ar))[::-3][1:]:
+ ar.insert(i + 1, ',')
+ return ''.join(ar)
+
+
+def natural_sort(ar):
+ '''
+ >>> natural_sort('1 3 4 2 5'.split())
+ ['1', '2', '3', '4', '5']
+ >>> natural_sort('1 10 20 2 3 4'.split())
+ ['1', '2', '3', '4', '10', '20']
+ '''
+ to_sort = []
+ for item in ar:
+        spl = re.split(r'(\d+)', item)
+        l2 = []
+        for el in spl:
+            try:
+                n = int(el)
+            except ValueError:
+                n = el
+ l2.append(n)
+ to_sort.append((l2, item))
+
+ to_sort.sort()
+ return [x[1] for x in to_sort]
+
+
+def dictify(values, colnames):
+ """
+ Convert a list of values into a dictionary based upon given column names.
+
+ If the column name starts with an '@', the value is assumed to be a comma
+ separated list.
+
+ If the name starts with a '#', the value is assumed to be an int.
+
+    If the name starts with '@#', the value is assumed to be a comma separated
+ list of ints.
+
+ """
+ d = {}
+ for i in xrange(len(colnames)):
+ key = colnames[i]
+ split = False
+ num = False
+
+ if key[0] == '@':
+ key = key[1:]
+ split = True
+ if key[0] == '#':
+ key = key[1:]
+ num = True
+
+ if i < len(values):
+ if num and split:
+ val = [int(x) for x in values[i].rstrip(',').split(',')]
+ elif num:
+ val = int(values[i])
+ elif split:
+ val = values[i].rstrip(',').split(',')
+ else:
+ val = values[i]
+
+ d[key] = val
+
+ else:
+ d[key] = None
+
+ return d
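+
+
+# Example (sketch): '#' marks ints, '@' comma-lists, '@#' lists of ints.
+#     dictify(['chr1', '100', '1,2,3'], ['chrom', '#start', '@#counts'])
+#     # -> {'chrom': 'chr1', 'start': 100, 'counts': [1, 2, 3]}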
+
+
+def gzip_aware_open(fname):
+ if fname == '-':
+ f = sys.stdin
+ elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+ f = gzip.open(os.path.expanduser(fname))
+ else:
+ f = open(os.path.expanduser(fname))
+ return f
+
+
+class gzip_opener:
+ '''
+ A Python 2.6 class to handle 'with' opening of text files that may
+ or may not be gzip compressed.
+ '''
+ def __init__(self, fname):
+ self.fname = fname
+
+ def __enter__(self):
+ self.f = gzip_aware_open(self.fname)
+ return self.f
+
+ def __exit__(self, type, value, traceback):
+ if self.f != sys.stdin:
+ self.f.close()
+ return False
+
+
+def filenames_to_uniq(names, new_delim='.'):
+ '''
+ Given a set of file names, produce a list of names consisting of the
+    unique parts of the names. This works from the end of the name. Chunks of
+ the name are split on '.' and '-'.
+
+ For example:
+ A.foo.bar.txt
+ B.foo.bar.txt
+ returns: ['A','B']
+
+ AA.BB.foo.txt
+ CC.foo.txt
+ returns: ['AA.BB','CC']
+
+ >>> filenames_to_uniq('a.foo.bar.txt b.foo.bar.txt'.split())
+ ['a', 'b']
+ >>> filenames_to_uniq('a.b.foo.txt c.foo.txt'.split())
+ ['a.b', 'c']
+
+ '''
+ name_words = []
+ maxlen = 0
+ for name in names:
+ name_words.append(name.replace('.', ' ').replace('-', ' ').strip().split())
+ name_words[-1].reverse()
+ if len(name_words[-1]) > maxlen:
+ maxlen = len(name_words[-1])
+
+ common = [False, ] * maxlen
+ for i in xrange(maxlen):
+ last = None
+ same = True
+ for nameword in name_words:
+ if i >= len(nameword):
+ same = False
+ break
+ if not last:
+ last = nameword[i]
+ elif nameword[i] != last:
+ same = False
+ break
+ common[i] = same
+
+ newnames = []
+ for nameword in name_words:
+ nn = []
+ for (i, val) in enumerate(common):
+ if not val and i < len(nameword):
+ nn.append(nameword[i])
+ nn.reverse()
+ newnames.append(new_delim.join(nn))
+ return newnames
+
+
+def parse_args(argv, defaults=None, expected_argc=0):
+ opts = {}
+ if defaults:
+ opts.update(defaults)
+
+ args = []
+
+ i = 0
+ while i < len(argv):
+ if argv[i][0] == '-':
+ arg = argv[i].lstrip('-')
+ if '=' in arg:
+                k, v = arg.split('=', 1)
+                if defaults and k in defaults:
+ if type(defaults[k]) == float:
+ opts[k] = float(v)
+ elif type(defaults[k]) == int:
+ opts[k] = int(v)
+ else:
+ opts[k] = v
+ else:
+ opts[arg] = True
+ else:
+ args.append(argv[i])
+ i += 1
+
+ while len(args) < expected_argc:
+ args.append(None)
+ return opts, args
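+
+
+# Example use of parse_args (sketch): '-k=v' options are coerced to the type
+# of the matching default and bare '-flag' options become True.
+#     opts, args = parse_args(['-mismatch=2', 'in.bam'], {'mismatch': 0}, 1)
+#     # -> ({'mismatch': 2}, ['in.bam'])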
+
+
+class memoize(object):
+ 'Simple memoizing decorator to cache results'
+ def __init__(self, func):
+ self.func = func
+ self.cache = {}
+
+ def __call__(self, *args):
+ if not isinstance(args, collections.Hashable):
+ # uncacheable. a list, for instance.
+ # better to not cache than blow up.
+ return self.func(*args)
+
+ if args in self.cache:
+ return self.cache[args]
+ else:
+ value = self.func(*args)
+ self.cache[args] = value
+ return value
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/ngs_utils.pyc
Binary file ngsutils/support/ngs_utils.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/regions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/regions.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,166 @@
+class RangeMatch(object):
+ '''
+ Simple genomic ranges. You can define chrom:start-end ranges, then ask if a
+ particular genomic coordinate maps to any of those ranges. This is less-
+ efficient than an R-Tree, but easier to code.
+ '''
+ def __init__(self, name):
+ self.ranges = {}
+ self.name = name
+
+ def add_range(self, chrom, strand, start, end):
+ if not chrom in self.ranges:
+ self.ranges[chrom] = {}
+
+ bin = start / 100000
+ if not bin in self.ranges[chrom]:
+ self.ranges[chrom][bin] = []
+ self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+ if (end / 100000) != bin:
+ for bin in xrange(bin + 1, (end / 100000) + 1):
+ if not bin in self.ranges[chrom]:
+ self.ranges[chrom][bin] = []
+ self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+ def get_tag(self, chrom, strand, pos, ignore_strand=False):
+ '''
+ returns (region, is_reverse_orientation)
+ '''
+ if not chrom in self.ranges:
+ return None, False
+ bin = pos / 100000
+ if not bin in self.ranges[chrom]:
+ return None, False
+ for start, end, r_strand in self.ranges[chrom][bin]:
+ if pos >= start and pos <= end:
+ if ignore_strand or strand == r_strand:
+ return self.name, False
+ return self.name, True
+ return None, False
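+
+
+# Example use of RangeMatch (sketch): register ranges, then ask where a
+# coordinate falls; the second value flags an antisense hit.
+#     rm = RangeMatch('exon')
+#     rm.add_range('chr1', '+', 100, 200)
+#     rm.get_tag('chr1', '+', 150)   # -> ('exon', False)
+#     rm.get_tag('chr1', '-', 150)   # -> ('exon', True)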
+
+
+class RegionTagger(object):
+ def __init__(self, gtf, valid_chroms=None, only_first_fragment=True):
+ self.regions = []
+ self.counts = {}
+ self.only_first_fragment = only_first_fragment
+
+ coding = RangeMatch('coding')
+ exons = RangeMatch('other-exon')
+ utr_5 = RangeMatch('utr-5')
+ utr_3 = RangeMatch('utr-3')
+ introns = RangeMatch('intron')
+ promoters = RangeMatch('promoter')
+
+ for gene in gtf.genes:
+ if valid_chroms and not gene.chrom in valid_chroms:
+ continue
+ if gene.strand == '+':
+ promoters.add_range(gene.chrom, gene.strand, gene.start - 2000, gene.start)
+ else:
+ promoters.add_range(gene.chrom, gene.strand, gene.end, gene.end + 2000)
+
+ for transcript in gene.transcripts:
+ if transcript.has_cds:
+ for start, end in transcript.cds:
+ coding.add_range(gene.chrom, gene.strand, start, end)
+
+ # TODO: Fix this so that it iterates over exons in the 5'/3' UTRS
+ for s, e in transcript.utr_5:
+ utr_5.add_range(gene.chrom, gene.strand, s, e)
+ for s, e in transcript.utr_3:
+ utr_3.add_range(gene.chrom, gene.strand, s, e)
+
+ last_end = None
+ for start, end in transcript.exons:
+ if last_end:
+ introns.add_range(gene.chrom, gene.strand, last_end, start)
+ exons.add_range(gene.chrom, gene.strand, start, end)
+ last_end = end
+
+
+ self.regions.append(coding)
+ self.regions.append(utr_5)
+ self.regions.append(utr_3)
+ self.regions.append(exons)
+ self.regions.append(introns)
+ self.regions.append(promoters)
+
+ self.counts['coding'] = 0
+ self.counts['coding-rev'] = 0
+ self.counts['other-exon'] = 0
+ self.counts['utr-5'] = 0
+ self.counts['utr-3'] = 0
+ self.counts['utr-5-rev'] = 0
+ self.counts['utr-3-rev'] = 0
+ self.counts['intron'] = 0
+ self.counts['promoter'] = 0
+ self.counts['other-exon-rev'] = 0
+ self.counts['intron-rev'] = 0
+ self.counts['promoter-rev'] = 0
+ self.counts['junction'] = 0
+ self.counts['intergenic'] = 0
+ self.counts['mitochondrial'] = 0
+
+ def add_read(self, read, chrom):
+ if read.is_unmapped:
+ return
+
+ if self.only_first_fragment and read.is_paired and not read.is_read1:
+ return
+
+ tag = None
+ is_rev = False
+
+ strand = '-' if read.is_reverse else '+'
+
+ if chrom == 'chrM':
+ tag = 'mitochondrial'
+
+ if not tag:
+ for op, length in read.cigar:
+ if op == 3:
+ tag = 'junction'
+ break
+
+ if not tag:
+ for region in self.regions:
+ tag, is_rev = region.get_tag(chrom, strand, read.pos)
+ if tag:
+ break
+
+ if not tag:
+ tag = 'intergenic'
+
+ if tag:
+ if is_rev:
+ self.counts['%s-rev' % tag] += 1
+ else:
+ self.counts[tag] += 1
+
+ return tag
+
+    def tag_region(self, chrom, start, end, strand):
+        tag = None
+        endtag = None
+        is_rev = False
+
+        if chrom == 'chrM' or chrom == 'M':
+            tag = 'mitochondrial'
+
+        if not tag:
+            for region in self.regions:
+                tag, is_rev = region.get_tag(chrom, strand, start)
+                if is_rev:
+                    tag = '%s-rev' % tag
+
+                if start != end:
+                    endtag, is_rev = region.get_tag(chrom, strand, end)
+                    if is_rev:
+                        endtag = '%s-rev' % endtag
+
+                if tag and endtag and endtag != tag:
+                    tag = '%s/%s' % (tag, endtag)
+
+                if tag:
+                    break
+
+        if not tag:
+            tag = 'intergenic'
+
+        return tag
\ No newline at end of file
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/stats.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/stats.py Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,161 @@
+'''
+various statistical tests and methods...
+'''
+import math
+from ngsutils.support import memoize
+
+def median(vals):
+ '''
+ >>> median([1,2,3])
+ 2
+ >>> median([1,2,3,4])
+ 2.5
+ '''
+ vals.sort()
+
+ if len(vals) % 2 == 1:
+ return vals[len(vals) / 2]
+ else:
+ a = vals[(len(vals) / 2) - 1]
+ b = vals[(len(vals) / 2)]
+ return float(a + b) / 2
+
+
+def mean_stdev(l):
+ '''
+ >>> mean_stdev([1,2,2,2])
+ (1.75, 0.5)
+ >>> mean_stdev([2,2,2,2])
+ (2.0, 0.0)
+ '''
+
+ acc = 0
+ for el in l:
+ acc += el
+
+ mean = float(acc) / len(l)
+ acc = 0
+ for el in l:
+ acc += (el - mean) ** 2
+
+    if len(l) > 1:
+ stdev = math.sqrt(float(acc) / (len(l) - 1))
+ else:
+ stdev = 0.0
+
+ return (mean, stdev)
+
+
+def counts_median(d):
+ '''
+ Calculate the median from counts stored in a dictionary
+ >>> counts_median({ 1: 4, 2: 1, 3: 4 })
+ 2
+ >>> counts_median({ 1: 4, 3: 4 })
+    2.0
+
+ '''
+ count = 0
+ for k in d:
+ count += d[k]
+
+ if count == 0:
+ return 0
+
+ acc = 0.0
+ last = 0
+ for k in sorted(d):
+ if last:
+            return float(last + k) / 2
+ acc += d[k]
+ if acc / count == 0.5:
+ last = k
+ elif acc / count > 0.5:
+ return k
+
+
+def counts_mean_stdev(d):
+ '''
+
+ calc mean / stdev when data is stored as counts in a dictionary
+
+ Ex:
+ { 1: 4, 2: 1, 3: 4 } = [1, 1, 1, 1, 2, 3, 3, 3, 3]
+
+ >>> counts_mean_stdev({ 1: 4, 2: 1, 3: 4 })
+ (2.0, 1.0)
+
+ '''
+
+ acc = 0
+ count = 0
+ for k in d:
+ acc += k * d[k]
+ count += d[k]
+
+ mean = float(acc) / count
+
+ acc = 0
+ for k in d:
+ acc += (((k - mean) ** 2) * d[k])
+
+ if count > 2:
+ stdev = math.sqrt(float(acc) / (count - 1))
+ else:
+ stdev = 0.0
+
+ return (mean, stdev)
+
+@memoize
+def poisson_prob(x, mean):
+ '''
+ Return the probability that you could get x counts in
+ a Poisson test with a mean value.
+
+ prob(x) = sum(i=1..x){poisson(i)}
+
+ >>> poisson_prob(6,10)
+ 0.1300960209527205
+ >>> poisson_prob(8,10)
+ 0.33277427882095645
+ '''
+ acc = 0.0
+ for i in xrange(1, x+1):
+ acc += poisson_func(i, mean)
+ return acc
+
+@memoize
+def poisson_func(mu, lambd):
+ '''
+ This is the Poisson distribution function
+
+ p(mu) = (lambda^mu * e^(-lambda)) / (mu!)
+
+ mu is a count
+ lambd is the mean
+
+ >>> poisson_func(1,10)
+ 0.00045399929762484856
+ >>> poisson_func(2,10)
+ 0.0022699964881242427
+ >>> poisson_func(3,10)
+ 0.007566654960414142
+ '''
+ return (lambd ** mu) * (math.exp(-1 * lambd)) / _factorial(mu)
+
+
+@memoize
+def _factorial(x):
+ '''
+ >>> _factorial(1)
+ 1
+ >>> _factorial(2)
+ 2
+ >>> _factorial(3)
+ 6
+ '''
+ return math.factorial(x)
+
+if __name__ == '__main__':
+ import doctest
+ doctest.testmod()
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_input1.bam
Binary file test-data/ngsutils_bam_filter_input1.bam has changed
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result1.bam
Binary file test-data/ngsutils_bam_filter_result1.bam has changed
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result2.bam
Binary file test-data/ngsutils_bam_filter_result2.bam has changed
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result3.bam
Binary file test-data/ngsutils_bam_filter_result3.bam has changed
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result4.bam
Binary file test-data/ngsutils_bam_filter_result4.bam has changed
diff -r 000000000000 -r 4e4e4093d65d tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,6 @@
+
+
+
+
+
+