# HG changeset patch # User iuc # Date 1447265047 18000 # Node ID 4e4e4093d65d3c7377067f5d7499d454b228aca7 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068 diff -r 000000000000 -r 4e4e4093d65d bam_filter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bam_filter.xml Wed Nov 11 13:04:07 2015 -0500 @@ -0,0 +1,231 @@ + + Removes reads from a BAM file based on criteria + + macros.xml + + + + + + + + + -1: + -uniq + #if int($uniq) > 0: + $uniq + #end if + #end if + $uniq_start + #if $mismatch: + -mismatch $mismatch + #end if + $nosecondary + $noqcfail + $nopcrdup + #if $excludebed: + -excludebed "${excludebed}" $ignore_strand + #end if + #if $includebed: + -includebed "${includebed}" $ignore_strand + #end if + #if $includeref: + -includeref "${includeref}" + #end if + #if $excluderef: + -excluderef "${excluderef}" + #end if + #if $maximum_mismatch_ratio + -maximum_mismatch_ratio $maximum_mismatch_ratio + #end if + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 4e4e4093d65d filter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter.py Wed Nov 11 13:04:07 2015 -0500 @@ -0,0 +1,986 @@ +#!/usr/bin/env python +## category General +## desc Removes reads from a BAM file based on criteria +""" +Removes reads from a BAM file based on criteria + +Given a BAM file, this script will only allow reads that meet filtering +criteria to be written to output. The output is another BAM file with the +reads not matching the criteria removed. + +Note: this does not adjust tag values reflecting any filtering. (for example: + if a read mapped to two locations (IH:i:2), and one was removed by + filtering, the IH:i tag would still read IH:i:2). + +Currently, the available filters are: + -minlen val Remove reads that are smaller than {val} + -maxlen val Remove reads that are larger than {val} + -mapped Keep only mapped reads + -unmapped Keep only unmapped reads + -properpair Keep only properly paired reads (both mapped, + correct orientation, flag set in BAM) + -noproperpair Keep only not-properly paired reads + + -mask bitmask Remove reads that match the mask (base 10/hex) + -uniq {length} Remove reads that are have the same sequence + Note: BAM file should be sorted + (up to an optional length) + -uniq_start Remove reads that start at the same position + Note: BAM file should be sorted + (Use only for low-coverage samples) + + -mismatch num # mismatches or indels + indel always counts as 1 regardless of length + (requires NM tag in reads) + + -mismatch_dbsnp num dbsnp.txt.bgz + # mismatches or indels - not in dbSNP. + Variations are called using the MD tag. + Variations that are found in the dbSNP list are + not counted as mismatches. The dbSNP list is a + Tabix-indexed dump of dbSNP (from UCSC Genome + Browser). Indels in dbSNP are also counted. + Adds a 'ZS:i' tag with the number of found SNPs + in the read. + (requires NM and MD tags) + + Example command for indexing: + ngsutils tabixindex snp.txt.gz -s 2 -b 3 -e 4 -0 + + -mismatch_ref num ref.fa # mismatches or indel - looks up mismatches + directly in a reference FASTA file + (use if NM tag not present) + + -mismatch_ref_dbsnp num ref.fa dbsnp.txt.bgz + # mismatches or indels - looks up mismatches + directly from a reference FASTA file. 
(See + -mismatch_dbsnp for dbSNP matching) + (use if NM or MD tag not present) + + -nosecondary Remove reads that have the 0x100 flag set + -noqcfail Remove reads that have the 0x200 flag set + -nopcrdup Remove reads that have the 0x400 flag set + + + -exclude ref:start-end Remove reads in this region (1-based start) + -excludebed file.bed {nostrand} + Remove reads that are in any of the regions + from the given BED file. If 'nostrand' is given, + strand information from the BED file is ignored. + + -include ref:start-end Remove reads NOT in the region (can only be one) + -includebed file.bed {nostrand} + Remove reads that are NOT any of the regions + from the given BED file. If 'nostrand' is given, + strand information from the BED file is ignored. + + Note: If this is a large dataset, use + "bamutils extract" instead. + + -includeref refname Exclude reads NOT mapped to a reference + -excluderef refname Exclude reads mapped to a particular reference + (e.g. chrM, or _dup chromosomes) + + -whitelist fname Remove reads that aren't on this list (by name) + -blacklist fname Remove reads that are on this list (by name) + These lists can be whitespace-delimited with + the read name as the first column. + -maximum_mismatch_ratio val + Filter by maximum mismatch ratio (fraction of length) + + -eq tag_name value + -lt tag_name value + -lte tag_name value + -gt tag_name value + -gte tag_name value + + As a special case, "MAPQ" can be used as the tag_name and the SAM MAPQ + value will be used. + +Common tags to filter by: + AS Alignment score + IH Number of alignments + NM Edit distance (each indel counts as many as its length) + + MAPQ Mapping quality (defined as part of SAM spec) + + The tag type (:i, :f, :Z) is optional. + +""" + +import os +import sys +import pysam +from ngsutils.bam import bam_iter +from ngsutils.support.dbsnp import DBSNP +from ngsutils.bam import read_calc_mismatches, read_calc_mismatches_ref, read_calc_mismatches_gen, read_calc_variations +from ngsutils.bed import BedFile + + +def usage(): + print __doc__ + print """ +Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria... + +Options: + -failed fname A text file containing the read names of all reads + that were removed with filtering + +Example: +bamutils filter filename.bam output.bam -mapped -gte AS:i 1000 + +This will remove all unmapped reads, as well as any reads that have an AS:i +value less than 1000. 
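+
+A second, hypothetical example combining several of the filters described
+above (file names are placeholders):
+
+bamutils filter input.bam filtered.bam -minlen 50 -nosecondary -excludebed blacklist.bed nostrand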
+""" + sys.exit(1) + + +class Unique(object): + def __init__(self, length=None): + if length: + self.length = int(length) + else: + self.length = None + + self.last_pos = None + self.pos_reads = set() + + def __repr__(self): + return "uniq" + + def filter(self, bam, read): + if self.last_pos != (read.tid, read.pos): + self.last_pos = (read.tid, read.pos) + self.pos_reads = set() + + if read.is_reverse: + seq = read.seq[::-1] # ignore revcomp for now, it isn't needed, just need to compare in the proper order + else: + seq = read.seq + + if self.length: + seq = seq[:self.length] + + if seq in self.pos_reads: + return False + + self.pos_reads.add(seq) + return True + + def close(self): + pass + + +class UniqueStart(object): + def __init__(self): + self.last_tid = None + self.last_fwd_pos = -1 + self.rev_pos_list = None + + def __repr__(self): + return "uniq_start" + + def filter(self, bam, read): + if self.last_tid is None or self.last_tid != read.tid: + self.last_tid = read.tid + self.rev_pos = set() + self.last_fwd_pos = -1 + self.last_rev_pos = -1 + + if read.is_reverse: + # check reverse reads from their start (3' aend) + # these aren't necessarily in the correct + # order in the file, so we have to track them in a set + + start_pos = read.aend + + # clean up hash if necesary + # Allow up to 100k over, to balance cleaning up the rev_pos hash + # and memory + if read.pos > (self.last_rev_pos + 100000): + self.last_rev_pos = start_pos + del_list = [] + for k in self.rev_pos: + if k < read.pos: + del_list.append(k) + + for k in del_list: + self.rev_pos.remove(k) + + if not start_pos in self.rev_pos: + self.rev_pos.add(start_pos) + return True + return False + else: + if read.pos != self.last_fwd_pos: + self.last_fwd_pos = read.pos + return True + return False + + # if self.last_pos != (read.tid, read.pos): + # self.last_pos = (read.tid, read.pos) + # return True + # return False + + def close(self): + pass + + +class Blacklist(object): + def __init__(self, fname): + self.fname = fname + self.notallowed = set() + with open(fname) as f: + for line in f: + self.notallowed.add(line.strip().split()[0]) + + def filter(self, bam, read): + return read.qname not in self.notallowed + + def __repr__(self): + return 'Blacklist: %s' % (self.fname) + + def close(self): + pass + + +class Whitelist(object): + def __init__(self, fname): + self.fname = fname + self.allowed = set() + with open(fname) as f: + for line in f: + self.allowed.add(line.strip().split()[0]) + + def filter(self, bam, read): + return read.qname in self.allowed + + def __repr__(self): + return 'Whitelist: %s' % (self.fname) + + def close(self): + pass + + +class IncludeRegion(object): + _excludes = [] + _last = None + + def __init__(self, region): + IncludeRegion._excludes.append(ExcludeRegion(region)) + # self.excl = ExcludeRegion(region) + + def filter(self, bam, read): + if read == IncludeRegion._last: + return True + + IncludeRegion._last = read + + for excl in IncludeRegion._excludes: + if not excl.filter(bam, read): + return True + return False + + # return not self.excl.filter(bam,read) + def __repr__(self): + return 'Including: %s' % (self.excl.region) + + def close(self): + pass + + +class IncludeBED(object): + def __init__(self, fname, nostrand=None): + self.excl = ExcludeBED(fname, nostrand) + + def filter(self, bam, read): + return not self.excl.filter(bam, read) + + def __repr__(self): + return 'Including from BED: %s%s' % (self.excl.fname, ' nostrand' if self.excl.nostrand else '') + + def close(self): + pass + + +class 
ExcludeRegion(object): + def __init__(self, region): + self.region = region + spl = region.split(':') + self.chrom = spl[0] + se = [int(x) for x in spl[1].split('-')] + self.start = se[0] - 1 + self.end = se[1] + + def filter(self, bam, read): + if not read.is_unmapped: + if bam.getrname(read.tid) == self.chrom: + if self.start <= read.pos <= self.end: + return False + if self.start <= read.aend <= self.end: + return False + return True + + def __repr__(self): + return 'Excluding: %s' % (self.region) + + def close(self): + pass + + +class ExcludeRef(object): + def __init__(self, ref): + self.ref = ref + + def filter(self, bam, read): + if not read.is_unmapped: + if bam.getrname(read.tid) == self.ref: + return False + return True + + def __repr__(self): + return 'Excluding: %s' % (self.ref) + + def close(self): + pass + +class IncludeRef(object): + def __init__(self, ref): + self.ref = ref + + def filter(self, bam, read): + if not read.is_unmapped: + if bam.getrname(read.tid) == self.ref: + return True + return False + + def __repr__(self): + return 'Including: %s' % (self.ref) + + def close(self): + pass + + +class ExcludeBED(object): + def __init__(self, fname, nostrand=None): + self.regions = {} # store BED regions as keyed bins (chrom, bin) + self.fname = fname + if nostrand == 'nostrand': + self.nostrand = True + else: + self.nostrand = False + + self.bed = BedFile(fname) + # with open(fname) as f: + # for line in f: + # if not line: + # continue + # if line[0] == '#': + # continue + # cols = line.strip().split('\t') + + # chrom = cols[0] + # start = int(cols[1]) + # end = int(cols[2]) + # if self.nostrand: + # strand = '?' + # else: + # strand = cols[5] + + # startbin = start / 100000 + # endbin = end / 100000 + + # for bin in xrange(startbin, endbin + 1): + # if not (chrom, bin) in self.regions: + # self.regions[(chrom, bin)] = [] + # self.regions[(chrom, bin)].append((start, end, strand)) + + def filter(self, bam, read): + if not read.is_unmapped: + if self.nostrand: + strand = None + elif read.is_reverse: + strand = '-' + else: + strand = '+' + + for region in self.bed.fetch(bam.getrname(read.tid), read.pos, read.aend, strand): + # region found, exclude read + return False + return True + + # bin = read.pos / 100000 + # ref = bam.getrname(read.tid) + + # if not (ref, bin) in self.regions: + # return True + + # for start, end, strand in self.regions[(ref, bin)]: + # if not self.nostrand: + # if strand == '+' and read.is_reverse: + # continue + # if strand == '-' and not read.is_reverse: + # continue + # if start <= read.pos <= end: + # return False + # if start <= read.aend <= end: + # return False + # return True + + def __repr__(self): + return 'Excluding from BED: %s%s' % (self.fname, ' nostrand' if self.nostrand else '') + + def close(self): + pass + + +class Mismatch(object): + def __init__(self, num): + self.num = int(num) + + def filter(self, bam, read): + if read.is_unmapped: + return False + + if read_calc_mismatches(read) > self.num: + return False + + return True + + def __repr__(self): + return '>%s mismatch%s' % (self.num, '' if self.num == 1 else 'es') + + def close(self): + pass + + +class MismatchRef(object): + def __init__(self, num, refname): + self.num = int(num) + self.refname = refname + + if not os.path.exists('%s.fai' % refname): + pysam.faidx(refname) + + self.ref = pysam.Fastafile(refname) + + def filter(self, bam, read): + if read.is_unmapped: + return False + + chrom = bam.getrname(read.tid) + if read_calc_mismatches_ref(self.ref, read, chrom) > self.num: 
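+            # more mismatches against the reference sequence than allowed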
+ return False + + return True + + def __repr__(self): + return '>%s mismatch%s in %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.refname)) + + def close(self): + self.ref.close() + + +class MismatchDbSNP(object): + def __init__(self, num, fname, verbose=None): + sys.stderr.write('Note: MismatchDbSNP is considered *experimental*\n') + + self.num = int(num) + self.fname = fname + self.dbsnp = DBSNP(fname) + if verbose == 'verbose': + self.verbose = True + else: + self.verbose = False + + def filter(self, bam, read): + if read.is_unmapped: + return False + + if read_calc_mismatches(read) <= self.num: + return True + + chrom = bam.getrname(read.tid) + + mm = 0 + snps = 0 + + for op, pos, seq in read_calc_variations(read): + if not self.dbsnp.is_valid_variation(chrom, op, pos, seq, self.verbose): + mm += 1 + else: + snps += 1 + + if mm > self.num: + return False + + if snps: + read.tags = read.tags + [('ZS', snps)] + + return True + + def __repr__(self): + return '>%s mismatch%s using %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.fname)) + + def close(self): + self.dbsnp.close() + + +class MismatchRefDbSNP(object): + def __init__(self, num, refname, dbsnpname): + sys.stderr.write('Note: MismatchRefDbSNP is considered *experimental*\n') + self.num = int(num) + self.refname = refname + self.dbsnp = DBSNP(dbsnpname) + + if not os.path.exists('%s.fai' % refname): + pysam.faidx(refname) + + self.ref = pysam.Fastafile(refname) + + def filter(self, bam, read): + if read.is_unmapped: + return False + + chrom = bam.getrname(read.tid) + + mm = 0 + snps = 0 + + for op, pos, seq in read_calc_mismatches_gen(self.ref, read, chrom): + if not self.dbsnp.is_valid_variation(chrom, op, pos, seq): + mm += 1 + else: + snps += 1 + + if mm > self.num: + return False + + if snps: + read.tags = read.tags + [('ZS', snps)] + + return True + + def __repr__(self): + return '>%s mismatch%s using %s/%s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.dbsnpname), os.path.basename(self.refname)) + + def close(self): + self.ref.close() + self.dbsnp.close() + + +class Mapped(object): + def __init__(self): + pass + + def filter(self, bam, read): + if read.is_paired and (read.is_unmapped or read.mate_is_unmapped): + return False + elif read.is_unmapped: + return False + return True + + def __repr__(self): + return 'is mapped' + + def close(self): + pass + + +class Unmapped(object): + def __init__(self): + pass + + def filter(self, bam, read): + if read.is_paired and (read.is_unmapped or read.mate_is_unmapped): + return True + elif read.is_unmapped: + return True + return False + + def __repr__(self): + return 'is unmapped' + + def close(self): + pass + + +class ProperPair(object): + def __init__(self): + pass + + def filter(self, bam, read): + if not read.is_paired: + return False + + if read.is_unmapped or read.mate_is_unmapped: + return False + + if read.is_reverse == read.mate_is_reverse: + return False + + return read.is_proper_pair + + def __repr__(self): + return 'proper pair' + + def close(self): + pass + + +class NoProperPair(object): + def __init__(self): + self.proper = ProperPair() + pass + + def filter(self, bam, read): + return not self.proper.filter(bam, read) + + def __repr__(self): + return 'not proper pairs' + + def close(self): + pass + + +class MaskFlag(object): + def __init__(self, value): + if type(value) == type(1): + self.flag = value + else: + if value[0:2] == '0x': + self.flag = int(value, 16) + else: + self.flag = int(value) + + def 
__repr__(self):
+        return "Doesn't match flag: %s" % self.flag
+
+    def filter(self, bam, read):
+        return (read.flag & self.flag) == 0
+
+    def close(self):
+        pass
+
+
+class SecondaryFlag(object):
+    def __repr__(self):
+        return "no 0x100 (secondary) flag"
+
+    def filter(self, bam, read):
+        return not read.is_secondary
+
+    def close(self):
+        pass
+
+
+class ReadMinLength(object):
+    def __init__(self, minval):
+        self.minval = int(minval)
+
+    def __repr__(self):
+        return "read length min: %s" % self.minval
+
+    def filter(self, bam, read):
+        return len(read.seq) >= self.minval
+
+    def close(self):
+        pass
+
+
+class ReadMaxLength(object):
+    def __init__(self, val):
+        self.val = int(val)
+
+    def __repr__(self):
+        return "read length max: %s" % self.val
+
+    def filter(self, bam, read):
+        return len(read.seq) <= self.val
+
+    def close(self):
+        pass
+
+
+class MaximumMismatchRatio(object):
+    def __init__(self, ratio):
+        self.ratio = float(ratio)
+
+    def __repr__(self):
+        return "maximum mismatch ratio: %s" % self.ratio
+
+    def filter(self, bam, read):
+        return read_calc_mismatches(read) <= self.ratio * len(read.seq)
+
+    def close(self):
+        pass
+
+
+class QCFailFlag(object):
+    def __repr__(self):
+        return "no 0x200 (qcfail) flag"
+
+    def filter(self, bam, read):
+        return not read.is_qcfail
+
+    def close(self):
+        pass
+
+
+class PCRDupFlag(object):
+    def __repr__(self):
+        return "no 0x400 (pcrdup) flag"
+
+    def filter(self, bam, read):
+        return not read.is_duplicate
+
+    def close(self):
+        pass
+
+
+class _TagCompare(object):
+    def __init__(self, tag, value):
+        self.args = '%s %s' % (tag, value)
+
+        if ':' in tag:
+            self.tag = tag.split(':')[0]
+            tagtype = tag.split(':')[1]
+            if tagtype == 'i':
+                self.value = int(value)
+            elif tagtype == 'f':
+                self.value = float(value)
+            elif tagtype == 'H':
+                self.value = int(value)  # not sure about this
+            else:
+                self.value = value
+        else:
+            self.tag = tag
+
+            # guess at type...
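+            # no tag type suffix was given: try int, then float, then
+            # fall back to keeping the value as a string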
+ try: + self.value = int(value) + except: + try: + self.value = float(value) + except: + self.value = value + + def get_value(self, read): + if self.tag == 'MAPQ': + return read.mapq + + for name, value in read.tags: + if name == self.tag: + return value + + return None + + def __repr__(self): + return "%s %s %s" % (self.tag, self.__class__.op, self.value) + + def close(self): + pass + + +class TagLessThan(_TagCompare): + op = '<' + + def filter(self, bam, read): + if self.get_value(read) < self.value: + return True + return False + + +class TagLessThanEqual(_TagCompare): + op = '<=' + + def filter(self, bam, read): + if self.get_value(read) <= self.value: + return True + return False + + +class TagGreaterThan(_TagCompare): + op = '>' + + def filter(self, bam, read): + if self.get_value(read) > self.value: + return True + return False + + +class TagGreaterThanEqual(_TagCompare): + op = '>=' + + def filter(self, bam, read): + if self.get_value(read) >= self.value: + return True + return False + + +class TagEqual(_TagCompare): + op = '=' + + def filter(self, bam, read): + if self.get_value(read) == self.value: + return True + return False + +_criteria = { + 'mapped': Mapped, + 'unmapped': Unmapped, + 'properpair': ProperPair, + 'noproperpair': NoProperPair, + 'noqcfail': QCFailFlag, + 'nosecondary': SecondaryFlag, + 'nopcrdup': PCRDupFlag, + 'mask': MaskFlag, + 'lt': TagLessThan, + 'gt': TagGreaterThan, + 'lte': TagLessThanEqual, + 'gte': TagGreaterThanEqual, + 'eq': TagEqual, + 'mismatch': Mismatch, + 'mismatch_ref': MismatchRef, + 'mismatch_dbsnp': MismatchDbSNP, + 'mismatch_ref_dbsnp': MismatchRefDbSNP, + 'exclude': ExcludeRegion, + 'excludebed': ExcludeBED, + 'excluderef': ExcludeRef, + 'includeref': IncludeRef, + 'include': IncludeRegion, + 'includebed': IncludeBED, + 'whitelist': Whitelist, + 'blacklist': Blacklist, + 'length': ReadMinLength, # deprecated + 'minlen': ReadMinLength, + 'maxlen': ReadMaxLength, + 'uniq': Unique, + 'maximum_mismatch_ratio': MaximumMismatchRatio +} + + +def bam_filter(infile, outfile, criteria, failedfile=None, verbose=False): + if verbose: + sys.stderr.write('Input file : %s\n' % infile) + sys.stderr.write('Output file : %s\n' % outfile) + if failedfile: + sys.stderr.write('Failed reads: %s\n' % failedfile) + sys.stderr.write('Criteria:\n') + for criterion in criteria: + sys.stderr.write(' %s\n' % criterion) + + sys.stderr.write('\n') + + bamfile = pysam.Samfile(infile, "rb") + outfile = pysam.Samfile(outfile, "wb", template=bamfile) + + if failedfile: + failed_out = open(failedfile, 'w') + else: + failed_out = None + + passed = 0 + failed = 0 + + def _callback(read): + return "%s | %s kept,%s failed" % ('%s:%s' % (bamfile.getrname(read.tid), read.pos) if read.tid > -1 else 'unk', passed, failed) + + for read in bam_iter(bamfile, quiet=True): + p = True + + for criterion in criteria: + if not criterion.filter(bamfile, read): + p = False + failed += 1 + if failed_out: + failed_out.write('%s\t%s\n' % (read.qname, criterion)) + #outfile.write(read_to_unmapped(read)) + break + if p: + passed += 1 + outfile.write(read) + + bamfile.close() + outfile.close() + if failed_out: + failed_out.close() + sys.stdout.write("%s kept\n%s failed\n" % (passed, failed)) + + for criterion in criteria: + criterion.close() + + +def read_to_unmapped(read): + ''' + Example unmapped read + 2358_2045_1839 | 4 | * | 0 | 0 | * | * | 0 | 0 | GGACGAGAAGGAGTATTTCTCCGAGAACACATTCACGGAGAGTCTAACTC | 0QT\VNO^IIRJKXTIHIKTY\STZZ[XOJKPWLHJQQQ^XPQYIIRQII | PG:Z:bfast | AS:i:- | NH:i:0 | IH:i:1 | 
HI:i:1 | CS:Z:T10213222020221330022203222011113021130222212230122 | CQ:Z:019=2+><%@.'>52%)'15?90<7;:5+(-49('-7-<>5-@5%<.2%= | CM:i:0 | XA:i:4 + + Example mapped read: + 2216_1487_198 | 16 | chr11:19915291-19925392 | 5531 | 12 | 24M2D26M | * | 0 | 0 | TCTGTCTGGGTGCAGTAGCTATACGTAATCCCAGCACTTTGGGAGGCCAA | 1C8FZ`M""""""WZ""%"#\I;"`R@D]``ZZ``\QKX\Y]````IK^` | PG:Z:bfast | AS:i:1150 | NM:i:2 | NH:i:10 | IH:i:1 | HI:i:1 | MD:Z:24^CT26 | CS:Z:T00103022001002113210023031213331122121223321221122 | CQ:Z:A8=%;AB<;97<3/9:>>3>@?5&18@-%;866:94=14:=?,%?:7&)1 | CM:i:9 | XA:i:4 | XE:Z:-------23322---2-11----2---------------------------- + + ''' + # TODO: rewrite read properties to unmapped state + # if colorspace: convert CS to NT directly + # remove excess tags + + read.is_unmapped = True + read.tid = None + read.pos = 0 + read.mapq = 0 + return read + +if __name__ == '__main__': + infile = None + outfile = None + failed = None + criteria = [] + + crit_args = [] + last = None + verbose = False + fail = False + + for arg in sys.argv[1:]: + if last == '-failed': + failed = arg + last = None + elif arg == '-h': + usage() + elif arg == '-failed': + last = arg + elif arg == '-v': + verbose = True + elif not infile and os.path.exists(arg): + infile = arg + elif not outfile: + outfile = arg + elif arg[0] == '-': + if not arg[1:] in _criteria: + print "Unknown criterion: %s" % arg + fail = True + if crit_args: + criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:])) + crit_args = [arg, ] + elif crit_args: + crit_args.append(arg) + else: + print "Unknown argument: %s" % arg + fail = True + + if not fail and crit_args: + criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:])) + + if fail or not infile or not outfile or not criteria: + if not infile and not outfile and not criteria: + usage() + + if not infile: + print "Missing: input bamfile" + if not outfile: + print "Missing: output bamfile" + if not criteria: + print "Missing: filtering criteria" + usage() + else: + bam_filter(infile, outfile, criteria, failed, verbose) diff -r 000000000000 -r 4e4e4093d65d macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Wed Nov 11 13:04:07 2015 -0500 @@ -0,0 +1,16 @@ + + 0.5.7 + + + pysam + + + + @WRAPPER_VERSION@ + + + + + + + diff -r 000000000000 -r 4e4e4093d65d ngsutils/__init__.py diff -r 000000000000 -r 4e4e4093d65d ngsutils/__init__.pyc Binary file ngsutils/__init__.pyc has changed diff -r 000000000000 -r 4e4e4093d65d ngsutils/bam/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsutils/bam/__init__.py Wed Nov 11 13:04:07 2015 -0500 @@ -0,0 +1,879 @@ +import sys +import os +import re +import pysam +try: + from eta import ETA +except: + pass +import ngsutils.support + + +def bam_open(fname, mode='r', *args, **kwargs): + if fname.lower()[-4:] == '.bam': + return pysam.Samfile(fname, '%sb' % mode, *args, **kwargs) + return pysam.Samfile(fname, '%s' % mode, *args, **kwargs) + + +def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None): + if not quiet and bam.filename: + eta = ETA(os.stat(bam.filename).st_size) + else: + eta = None + + for pileup in bam.pileup(mask=mask): + pos = bam.tell() + bgz_offset = pos >> 16 + + if not quiet: + if callback: + eta.print_status(bgz_offset, extra=callback(pileup)) + else: + eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos)) + + yield pileup + + if eta: + eta.done() + + +def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None): + ''' + >>> [x.qname for x in 
bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)] + ['A', 'B', 'E', 'C', 'D', 'F', 'Z'] + ''' + + if os.path.exists('%s.bai' % bam.filename): + # This is an indexed file, so it is ref sorted... + # Meaning that we should show chrom:pos, instead of read names + show_ref_pos = True + + eta = None + + if not ref: + if not quiet and bam.filename: + eta = ETA(os.stat(bam.filename).st_size) + + for read in bam: + pos = bam.tell() + bgz_offset = pos >> 16 + + if not quiet and eta: + if callback: + eta.print_status(bgz_offset, extra=callback(read)) + elif (show_ref_pos): + if read.tid > -1: + eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname)) + else: + eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname)) + else: + eta.print_status(bgz_offset, extra='%s' % read.qname) + + yield read + + else: + working_chrom = None + if ref in bam.references: + working_chrom = ref + elif ref[0:3] == 'chr': + # compensate for Ensembl vs UCSC ref naming + if ref[3:] in bam.references: + working_chrom = ref[3:] + + if not working_chrom: + raise ValueError('Missing reference: %s' % ref) + + tid = bam.gettid(working_chrom) + + if not start: + start = 0 + if not end: + end = bam.lengths[tid] + + if not quiet and bam.filename: + eta = ETA(end - start) + + for read in bam.fetch(working_chrom, start, end): + if not quiet and eta: + if callback: + eta.print_status(read.pos - start, extra=callback(read)) + else: + eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname)) + + yield read + + if eta: + eta.done() + + +def bam_batch_reads(bam, quiet=False): + ''' + Batch mapping for the same reads (qname) together, this way + they can all be compared/converted together. 
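+
+    Assumes alignments for the same qname are adjacent in the file
+    (e.g. a name-sorted BAM); otherwise a read's alignments end up
+    split across several batches.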
+ ''' + reads = [] + last = None + for read in bam_iter(bam, quiet=quiet): + if last and read.qname != last: + yield reads + reads = [] + last = read.qname + reads.append(read) + + if reads: + yield reads + + +bam_cigar = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X'] +bam_cigar_op = { + 'M': 0, + 'I': 1, + 'D': 2, + 'N': 3, + 'S': 4, + 'H': 5, + 'P': 6, + '=': 7, + 'X': 8, +} + + +def cigar_fromstr(s): + ''' + >>> cigar_fromstr('10M5I20M') + [(0, 10), (1, 5), (0, 20)] + >>> cigar_fromstr('1M1I1D1N1S1H1P') + [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)] + ''' + ret = [] + spl = re.split('([0-9]+)', s)[1:] + for op, size in zip(spl[1::2], spl[::2]): + ret.append((bam_cigar_op[op], int(size))) + return ret + + +def cigar_tostr(cigar): + ''' + >>> cigar_tostr(((0, 10), (1, 5), (0, 20))) + '10M5I20M' + >>> cigar_tostr(((0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1))) + '1M1I1D1N1S1H1P' + ''' + + s = '' + + for op, size in cigar: + s += '%s%s' % (size, bam_cigar[op]) + + return s + + +def cigar_read_len(cigar): + ''' + >>> cigar_read_len(cigar_fromstr('8M')) + 8 + >>> cigar_read_len(cigar_fromstr('8M100N8M')) + 16 + >>> cigar_read_len(cigar_fromstr('8M10I8M')) + 26 + >>> cigar_read_len(cigar_fromstr('8M10D8M')) + 16 + ''' + + read_pos = 0 + + for op, length in cigar: + if op == 0: # M + read_pos += length + elif op == 1: # I + read_pos += length + elif op == 2: # D + pass + elif op == 3: # N + pass + elif op == 4: # S + read_pos += length + elif op == 5: # H + pass + elif op == 7: # = + read_pos += length + elif op == 8: # X + read_pos += length + else: + raise ValueError("Unsupported CIGAR operation: %s" % op) + + return read_pos + + +def read_calc_mismatches(read): + inserts = 0 + deletions = 0 + indels = 0 + edits = int(read.opt('NM')) + # + # NM counts the length of indels + # We really just care about *if* there is an indel, not the size + # + + for op, length in read.cigar: + if op == 1: + inserts += length + indels += 1 + elif op == 2: + deletions += length + indels += 1 + + return edits - inserts - deletions + indels + + +def _extract_md_matches(md, maxlength): + md_pos = 0 + + while md and md_pos < maxlength: + tmp = '0' # preload a zero so that immediate mismatches will be caught + # the zero will have no affect otherwise... + + # look for matches + while md and md[0] in '0123456789': + tmp += md[0] + md = md[1:] + + pos = int(tmp) + if pos > maxlength: + return (maxlength, '%s%s' % (pos - maxlength, md)) + return (pos, md) + + +def read_calc_variations(read): + 'see _read_calc_variations' + for tup in _read_calc_variations(read.pos, read.cigar, read.opt('MD'), read.seq): + yield tup + + +def _read_calc_variations(start_pos, cigar, md, seq): + ''' + For each variation, outputs a tuple: (op, pos, seq) + + op - operation (0 = mismatch, 1 = insert, 2 = deletion) (like CIGAR) + pos - 0-based position of the variation (relative to reference) + seq - the base (or bases) involved in the variation + for mismatch or insert, this is the sequence inserted + for deletions, this is the reference sequence that was removed + + MD is the mismatch string. Not all aligners include the tag. If your aligner + doesn't include this, then you'll need to add it, or use a different function + (see: read_calc_mismatches_gen). + + Special care must be used to handle RNAseq reads that cross + an exon-exon junction. + + Also: MD is a *really* dumb format that can't be read correctly with + a regex. It must be processed in concert with the CIGAR alignment + in order to catch all edge cases. 
Some implementations insert 0's + at the end of inserts / deltions / variations to make parsing easier + but not everyone follows this. Look at the complex examples: the + CIGAR alignment may show an insert, but the MD just shows all matches. + + Examples: See: http://davetang.org/muse/2011/01/28/perl-and-sam/ + Also from CCBB actual mappings and manual altered (shortened, + made more complex) + (doctests included) + + Match/mismatch + CIGAR: 36M + MD:Z: 1A0C0C0C1T0C0T27 + MD:Z: 1ACCC1TCT27 (alternative) + 1 2 + 123456789012345678901234567890123456 + ref: CGATACGGGGACATCCGGCCTGCTCCTTCTCACATG + XXXX XXX + read: CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG + MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM + -ACCC-TCT--------------------------- + >>> list(_read_calc_variations(1, [(0,36)], '1A0C0C0C1T0C0T27', 'CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG')) + [(0, 2, 'A'), (0, 3, 'C'), (0, 4, 'C'), (0, 5, 'C'), (0, 7, 'T'), (0, 8, 'C'), (0, 9, 'T')] + + Insert + CIGAR: 6M1I29M + MD:Z: 0C1C0C1C0T0C27 + C1CC1CTC27 (alt) + 1 2 + 123456^789012345678901234567890123456 + ref: CACCCC^TCTGACATCCGGCCTGCTCCTTCTCACAT + X XX X|XX + read: GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT + MMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMM + G-GA-GGGG--------------------------- + >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], '0C1C0C1C0T0C27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT')) + [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')] + >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], 'C1CC1CTC27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT')) + [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')] + + + Deletion + CIGAR: 9M9D27M + MD:Z: 2G0A5^ATGATGTCA27 + 2GA5^ATGATGTCA27 (alt) + ref: AGGAATGGGATGATGTCAGGGGTTCCAGGTGGAGACGAGGACTCC + XX ^^^^^^^^^ + read: AGTGATGGG^^^^^^^^^GGGGTTCCAGGTGGAGACGAGGACTCC + MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM + --TG-----ATGATGTCA--------------------------- + >>> list(_read_calc_variations(1, [(0,9), (2,9), (0, 27)], '2G0A5^ATGATGTCA27', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC')) + [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA')] + + + Complex + CIGAR: 9M9D11M1I15M + MD:Z: 2G0A5^ATGATGTCAA26 + MD:Z: 2G0A5^ATGATGTCA0G26 (alt) + 1 2 3 4 + pos: 123456789012345678901234567890123456789012345 + ref: AGGAATGGGATGATGTCAGGGGTTCCAGG^GGAGACGAGGACTCC + XX ^^^^^^^^^X | + read: AGTGATGGG^^^^^^^^^AGGGTTCCAGGTGGAGACGAGGACTCC + MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM + --TG-----ATGATGTCAG----------T--------------- + >>> list(_read_calc_variations(1, [(0,9), (2,9), (0,11), (1,1), (0,15)], '2G0A5^ATGATGTCAA26', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC')) + [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA'), (0, 19, 'G'), (1, 30, 'T')] + + + Complex example - inserts aren't separately handled by MD, only visible in CIGAR + CIGAR: 14M2D16M3I42M + MD:Z: 14^TC58 + 1 2 3 4 5 6 7 + pos: 12345678901234567890123456789012^^^345678901234567890123456789012345678901234567 + ref: caagtatcaccatgtcaggcatttttttcatt^^^tttgtagagagagaagacttgctatgttgcccaagctggcct + ^^ ||| + read: CAAGTATCACCATG^^AGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT + MMMMMMMMMMMMMMDDMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM + --------------tc----------------TGG------------------------------------------ + >>> list(_read_calc_variations(1, [(0,14), (2,2), (0,16), (1,3), (0,42)], '14^TC58', 'CAAGTATCACCATGAGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT')) + [(2, 15, 'TC'), (1, 33, 'TGG')] + + + Complex example 2: + CIGAR: 41M3I10M1I5M1I2M2I10M + 
MD:Z: 44C2C6T6T6 + 1 2 3 4 5 6 + pos: 12345678901234567890123456789012345678901^^^2345678901^23456^78^^9012345678 + ref: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG + ||| X X | X | || X + read: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG + MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM + -----------------------------------------tta---A--T---c---A----gt---G------ + + + 13M 28M 3I 10M 1I 5M 1I 2M 2I 10M + >>> list(_read_calc_variations(1, [(0, 41), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) + [(1, 42, 'TTA'), (0, 45, 'A'), (0, 48, 'T'), (1, 52, 'C'), (0, 55, 'A'), (1, 57, 'G'), (1, 59, 'GT'), (0, 62, 'G')] + + + Splice junction example: + CIGAR: 62M100N13M + MD:Z: 2T27C44 + 1 1 + 1 2 3 4 5 6 6 7 + pos: 12345678901234567890123456789012345678901234567890123456789012| [100] |3456789012345 + ref: CCTCATGACCAGCTTGTTGAAGAGATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC + X X + read: CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC + --G---------------------------T----------------------------------------------------- + + >>> list(_read_calc_variations(1, [(0,62), (4,100), (0,13)], '2T27C44', 'CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC')) + [(0, 3, 'G'), (0, 31, 'T')] + + + Splice junction example 2: + CIGAR: 13M100N28M3I10M1I5M1I2M2I10M + MD:Z: 44C2C6T6T6 + 1 1 1 1 1 + 1 2 3 4 5 6 + pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678 + ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG + ||| X X | X | || X + read: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG + MMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM + ------------- ----------------------------tta---A--T---c---A----gt---G------ + + 13M 100N 28M 3I 10M 1I 5M 1I 2M 2I 10M + >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 28), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) + [(1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')] + + + Splice junction example 2A: + CIGAR: 13M100N7M2D19M3I10M1I5M1I2M2I10M + MD:Z: 9A10^GG22C2C6T6T6 + 1 1 1 1 1 + 1 2 3 4 5 6 + pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678 + ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG + ^^ ||| X X | X | || X + read: AGGGTGGCGCGAT|-------|CGATGAC^^CATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG + MMMMMMMMMMMMM MMMMMMMDDMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM + ---------C--- ----------------------------tta---A--T---c---A----gt---G------ + .........A... .......GG................... ...C..C... ...T. .. ...T...... 
+ 9 A 10 ^GG 22 C 2C 6 T 6 T 6 + + >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 7), (2, 2), (0, 19), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '9A10^GG22C2C6T6T6', 'AGGGTGGCGCGATCGATGACCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) + [(0, 10, 'C'), (2, 121, 'GG'), (1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')] + + Real Example + 242_1071_1799_B1 + CIGAR: 42M10I3M1D9M1D11M + MD:Z: 27G16A0^T6C2^T1C9 + 1 2 3 4 5 6 7 + pos: 123456789012345678901234567890123456789012 345678901234567890123456789012345 + ref: ACTGAGAAACCCAACCCTCTGAGACCAGCACACCCCTTTCAA^^^^^^^^^^GCATGTTCCTCCCTCCCCTTCTTTG + X X^ X ^ X + read: ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCC^GTTCCTGCC^CGCCTTCTTTG + MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIMMMDMMMMMMMMMDMMMMMMMMMMM + ---------------------------A--------------^^^^^^^^^^--CT------G--T-G--------- + + >>> list(_read_calc_variations(1, [(0,42), (1,10), (0, 3), (2, 1), (0, 9), (2, 1), (0, 11)], '27G16A0^T6C2^T1C9', 'ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCCGTTCCTGCCCGCCTTCTTTG', )) + [(0, 28, 'A'), (1, 43, 'CACATTTTTG'), (0, 45, 'C'), (2, 46, 'T'), (0, 53, 'G'), (2, 56, 'T'), (0, 58, 'G')] + + + Real example 2 + 577_1692_891_A1 + CIGAR: 34M100N39M2I + MD:Z: 3T69 + 1 1 1 1 + 1 2 3 4 5 6 7 + pos: 1234567890123456789012345678901234| [100] |567890123456789012345678901234567890123 + ref: GGATTCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTC^^ + + read: GGAATCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC + MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM NNNNN MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMII + ---A------------------------------ ---------------------------------------TC + + >>> list(_read_calc_variations(1, [(0,34), (3,100), (0, 39), (1, 2)], '3T69', 'GGAATCTTCCCACTGGGTCGATGTTGTTTGTGATCTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC', )) + [(0, 4, 'A'), (1, 174, 'TC')] + + ''' + + ref_pos = start_pos + read_pos = 0 + + for op, length in cigar: + if md and md[0] == '0': + md = md[1:] + # sys.stderr.write('%s, %s, %s\n' %(op, length, md)) + if op == 0: # M + # how far in the chunk are we? (do *not* update ref_pos until end) + md_pos = 0 + last = None + while md and md_pos < length: + if last == (op, length, md): + sys.stderr.write('\nInfinite loop in variant finding!\nPos: %s\nCIGAR: (%s, %s)\n' % (ref_pos, op, length)) + sys.exit(1) + last = (op, length, md) + # sys.stderr.write('%s, %s, %s\n' %(op, length, md)) + chunk_size, md = _extract_md_matches(md, length - md_pos) + # sys.stderr.write(' -> %s, %s\n' %(chunk_size, md)) + md_pos += chunk_size + + # look for mismatches + while md_pos < length and md and md[0] not in '0123456789^': + yield (op, ref_pos + md_pos, seq[read_pos + md_pos]) + md = md[1:] + + md_pos += 1 + + ref_pos += length + read_pos += length + + elif op == 1: # I + # nothing in MD about inserts... 
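+            # inserted bases exist only in the read, so report them
+            # straight from seq at the current read offset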
+ yield (op, ref_pos, seq[read_pos:read_pos + length]) + read_pos += length + + elif op == 2: # D + # prefixed with '^' and includes all of the removed bases + if md[0] == '^': + md = md[1:] + yield (op, ref_pos, md[:length]) + md = md[length:] + ref_pos += length + + elif op == 3: # N + ref_pos += length + + +def read_calc_mismatches_gen(ref, read, chrom): + start = read.pos + ref_pos = 0 + read_pos = 0 + + for op, length in read.cigar: + if op == 1: + yield ref_pos, op, None + read_pos += length + elif op == 2: + yield ref_pos, op, None + ref_pos += length + elif op == 3: + ref_pos += length + elif op == 0: + refseq = ref.fetch(chrom, start + ref_pos, start + ref_pos + length) + if not refseq: + raise ValueError("Reference '%s' not found in FASTA file: %s" % (chrom, ref.filename)) + cur_pos = start + ref_pos + for refbase, readbase in zip(refseq.upper(), read.seq[read_pos:read_pos + length].upper()): + if refbase != readbase: + yield op, cur_pos, readbase + cur_pos += 1 + ref_pos += length + read_pos += length + else: + raise ValueError("Unsupported CIGAR operation: %s" % op) + + +def read_calc_mismatches_ref(ref, read, chrom): + edits = 0 + + for op, pos, base in read_calc_mismatches_gen(ref, read, chrom): + edits += 1 + + return edits + + +__region_cache = {} + + +def region_pos_to_genomic_pos(name, start, cigar): + ''' + converts a junction position to a genomic location given a junction + ref name, the junction position, and the cigar alignment. + + returns: (genomic ref, genomic pos, genomic cigar) + + >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050,3000-4000', 25, [(0, 100)]) + ('chr1', 1025, [(0, 25), (3, 950), (0, 50), (3, 950), (0, 25)]) + + >>> region_pos_to_genomic_pos('chr1:1000-1050,1050-1200', 25, [(0, 100)]) + ('chr1', 1025, [(0, 25), (3, 0), (0, 75)]) + + >>> region_pos_to_genomic_pos('chr3R:17630851-17630897,17634338-17634384', 17, [(0, 39)]) + ('chr3R', 17630868, [(0, 29), (3, 3441), (0, 10)]) + + >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(4, 25), (0, 50), (4, 25)]) + ('chr1', 1025, [(4, 25), (0, 25), (3, 950), (0, 25), (4, 25)]) + + >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)]) + ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)]) + + >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)]) + ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)]) + + >>> region_pos_to_genomic_pos('chr7:16829153-16829246,16829246-16829339', 62, cigar_fromstr('83M18S')) + ('chr7', 16829215, [(0, 31), (3, 0), (0, 52), (4, 18)]) + + + ''' + + if name in __region_cache: + chrom, fragments = __region_cache[name] + else: + c1 = name.split(':') + chrom = c1[0] + + fragments = [] + for fragment in c1[1].split(','): + s, e = fragment.split('-') + fragments.append((int(s), int(e))) + + __region_cache[name] = (chrom, fragments) + + chr_cigar = [] + chr_start = fragments[0][0] + + read_start = int(start) + + frag_idx = 0 + frag_start = 0 + frag_end = 0 + + for i, (s, e) in enumerate(fragments): + if chr_start + read_start < e: + chr_start += read_start + frag_idx = i + frag_start = s + frag_end = e + break + + else: + chr_start += (e - s) + read_start -= (e - s) + + cur_pos = chr_start + + for op, length in cigar: + if op in [1, 4, 5]: + chr_cigar.append((op, length)) + + elif op in [0, 2]: + if cur_pos + length <= frag_end: + cur_pos += length + chr_cigar.append((op, length)) + + else: + while cur_pos + length > frag_end: + if frag_end - cur_pos > 0: + chr_cigar.append((op, frag_end - cur_pos)) + length -= 
(frag_end - cur_pos) + cur_pos = frag_end + frag_idx += 1 + if len(fragments) <= frag_idx: + print 'ERROR converting: ', name, fragments + return (chrom, 0, chr_cigar) + frag_start, frag_end = fragments[frag_idx] + chr_cigar.append((3, frag_start - cur_pos)) + cur_pos = frag_start + + cur_pos = cur_pos + length + chr_cigar.append((op, length)) + else: + print "Unsupported CIGAR operation (%s)" % bam_cigar[op] + sys.exit(1) + + return (chrom, chr_start, chr_cigar) + + +def is_junction_valid(cigar, min_overlap=4): + ''' + Does the genomic cigar alignment represent a 'good' alignment. + Used for checking junction->genome alignments + + 1) the alignment must not start at a splice junction + 2) the alignment must not start or end with an overhang + 3) the alignment must overhang the splice junction by min_overlap (4) + + | Exon1 | Intron | Exon2 | + |-----------------|oooooooooooooooo|------------------| + XXXXXXXXXXXXXXXXXXXXXXXX (bad 1) + XXXXXXXXXXXXX (bad 2) XXXXXXXXXXXXXXXX (bad 2) + XX-----------------XXXXXXXXXXXXXXXXX (bad 3) + + >>> is_junction_valid(cigar_fromstr('1000N40M')) + (False, 'Starts at gap (1000N40M)') + + >>> is_junction_valid(cigar_fromstr('100M')) + (False, "Doesn't cover junction") + + >>> is_junction_valid(cigar_fromstr('100M1000N3M'), 4) + (False, "Too short overlap at 3' (100M1000N3M)") + + >>> is_junction_valid(cigar_fromstr('2M1000N100M'), 4) + (False, "Too short overlap at 5' (2M1000N100M)") + + >>> is_junction_valid(cigar_fromstr('4M1000N100M'), 4) + (True, '') + + >>> is_junction_valid(cigar_fromstr('100M1000N4M'), 4) + (True, '') + + >>> is_junction_valid(cigar_fromstr('4M0N100M'), 4) + (True, '') + + ''' + first = True + pre_gap = True + + pre_gap_count = 0 + post_gap_count = 0 + + has_gap = False + + for op, length in cigar: + # mapping can't start at a gap + if first and op == 3: + return (False, 'Starts at gap (%s)' % cigar_tostr(cigar)) + first = False + + if op == 3: + pre_gap = False + post_gap_count = 0 + has_gap = True + + elif pre_gap: + pre_gap_count += length + else: + post_gap_count += length + + # mapping must start with more than min_overlap base match + + if not has_gap: + return (False, "Doesn't cover junction") + elif pre_gap_count < min_overlap: + return (False, "Too short overlap at 5' (%s)" % cigar_tostr(cigar)) + elif post_gap_count < min_overlap: + return (False, "Too short overlap at 3' (%s)" % cigar_tostr(cigar)) + + return True, '' + + +def read_alignment_fragments_gen(read): + ''' + Takes a read and returns the start and end positions for each match/mismatch region. + This will let us know where each read alignment "touches" the genome. 
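+
+    Only match/mismatch (M) operations produce fragments; insertions
+    consume no reference bases, while deletions and skipped regions (N)
+    advance the reference position between fragments.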
+ ''' + + for start, end in _read_alignment_fragments_gen(read.pos, read.cigar): + yield (start, end) + + +def _read_alignment_fragments_gen(pos, cigar): + ''' Test-able version of read_alignment_fragments_gen + >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('50M'))) + [(1, 51)] + + >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('25M100N25M'))) + [(1, 26), (126, 151)] + + >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('20M1D4M100N10M5I10M'))) + [(1, 21), (22, 26), (126, 136), (136, 146)] + ''' + ref_pos = pos + + for op, length in cigar: + if op == 0: + yield (ref_pos, ref_pos + length) + ref_pos += length + elif op == 1: + pass + elif op == 2: + ref_pos += length + elif op == 3: + ref_pos += length + else: + raise ValueError("Unsupported CIGAR operation: %s" % op) + + +def read_cigar_at_pos(cigar, qpos, is_del): + ''' + Returns the CIGAR operation for a given read position + + qpos is the 0-based index of the base in the read + ''' + pos = 0 + returnnext = False + for op, length in cigar: + if returnnext: + return op + if op == 0: + pos += length + elif op == 1: + pos += length + elif op == 2: + pass + elif op == 3: + pass + elif op == 4: + pos += length + elif op == 5: + pass + else: + raise ValueError("Unsupported CIGAR operation: %s" % op) + + if pos > qpos: + return op + if is_del and pos == qpos: + returnnext = True + + return None + + +def cleancigar(cigar): + ''' + Cleans a CIGAR alignment to remove zero length operations + + >>> cigar_tostr(cleancigar(cigar_fromstr('31M0N52M18S'))) + '83M18S' + + ''' + newcigar = [] + + changed = False + zero = False + + for op, size in cigar: + if size > 0: + if zero and newcigar: + if newcigar[-1][0] == op: + newsize = newcigar[-1][1] + size + newcigar[-1] = (op, newsize) + zero = 0 + else: + newcigar.append((op, size)) + else: + changed = True + zero = True + + if changed: + return newcigar + + return None + + +def read_cleancigar(read): + ''' + Replaces the CIGAR string for a read to remove any operations that are zero length. + + This happens to the read in-place + ''' + if read.is_unmapped: + return False + + newcigar = [] + + newcigar = cleancigar(read.cigar) + + if newcigar: + read.cigar = newcigar + return True + + return False + + +def read_to_unmapped(read, ref=None): + ''' + Converts a read from mapped to unmapped. + + Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is passed) + ''' + + newread = pysam.AlignedRead() + + if ref: + tags = [('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar)))] + + newread.is_unmapped = True + newread.mapq = 0 + newread.tlen = 0 + newread.pos = -1 + newread.pnext = -1 + newread.rnext = -1 + newread.tid = -1 + + newread.qname = read.qname + + if read.is_paired: + newread.is_paired = True + + if not read.is_unmapped and read.is_reverse: + newread.seq = ngsutils.support.revcomp(read.seq) + newread.qual = read.qual[::-1] + else: + newread.seq = read.seq + newread.qual = read.qual + + newread.tags = tags + + return newread + + + +if __name__ == '__main__': + import doctest + doctest.testmod() diff -r 000000000000 -r 4e4e4093d65d ngsutils/bam/__init__.pyc Binary file ngsutils/bam/__init__.pyc has changed diff -r 000000000000 -r 4e4e4093d65d ngsutils/bed/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsutils/bed/__init__.py Wed Nov 11 13:04:07 2015 -0500 @@ -0,0 +1,282 @@ +import os +import ngsutils.support.ngs_utils +import pysam + + +class BedStreamer(object): + ''' + Streams BedRegions from a BED file + + Note - this can only be used once! 
There is no mechanism to seek the stream. + ''' + + def __init__(self, fname=None, fileobj=None, quiet=False): + if not fname and not fileobj: + raise ValueError("You must specify either fname or fileobj!") + + self.reader = ngsutils.support.gzip_reader(fname=fname, quiet=quiet, fileobj=fileobj) + + def __iter__(self): + return self + + def next(self): + try: + while True: + line = self.reader.next().strip() + if line and line[0] != '#': + cols = line.split('\t') + while len(cols) < 6: + cols.append('') + + return BedRegion(*cols) + except: + raise StopIteration + + + +class BedFile(object): + ''' + BED files are read in their entirety memory, in a series of bins. Each bin + is ~100kb in size. Each bin can then be iterated over. + + This is less efficient than using a proper index, but in reality, this + usually isn't an issue. However, if the BED file has been Tabix indexed, + that index will be used for random access. + + NOTE: This isn't very efficient, so perhaps this can be remodeled into a BedFile + and a BedFileIndex where the file is indexed only if random access is requested. + ''' + + _bin_const = 100000 + + def __init__(self, fname=None, fileobj=None, region=None): + self._bins = {} + self._bin_list = [] + self._cur_bin_idx = 0 + self._cur_bin_pos = 0 + self._tellpos = 0 + self._total = 0 + self._length = 0 + self.__tabix = None + + self.filename = fname + + if os.path.exists('%s.tbi' % fname): + self.__tabix = pysam.Tabixfile(fname) + + if fileobj: + self.__readfile(fileobj) + elif fname: + with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj: + self.__readfile(fobj) + elif region: + chrom, startend = region.split(':') + if '-' in startend: + start, end = [int(x) for x in startend.split('-')] + else: + start = int(startend) + end = start + start -= 1 + + self.__add_region(BedRegion(chrom, start, end)) + else: + raise ValueError("Must specify either filename, fileobj, or region") + + def __readfile(self, fobj): + for line in fobj: + line = line.strip() + if line and line[0] != '#': + cols = line.split('\t') + while len(cols) < 6: + cols.append('') + + region = BedRegion(*cols) + self.__add_region(region) + + self._bin_list.sort() + for bin in self._bins: + self._bins[bin].sort() + + def __add_region(self, region): + self._total += region.end - region.start + self._length += 1 + + startbin = region.start / BedFile._bin_const + endbin = region.end / BedFile._bin_const + + for bin in xrange(startbin, endbin + 1): + if not (region.chrom, bin) in self._bins: + self._bin_list.append((region.chrom, bin)) + self._bins[(region.chrom, bin)] = [] + self._bins[(region.chrom, bin)].append(region) + + def fetch(self, chrom, start, end, strand=None): + ''' + For TABIX indexed BED files, find all regions w/in a range + + For non-TABIX index BED files, use the calculated bins, and + output matching regions + ''' + + if self.__tabix: + for match in self.__tabix.fetch(chrom, start, end): + region = BedRegion(*match.split('\t')) + if not strand or (strand and region.strand == strand): + yield region + else: + startbin = start / BedFile._bin_const + endbin = end / BedFile._bin_const + + buf = set() + + for bin in xrange(startbin, endbin + 1): + if (chrom, bin) in self._bins: + for region in self._bins[(chrom, bin)]: + if strand and strand != region.strand: + continue + if start <= region.start <= end or start <= region.end <= end: + if not region in buf: + yield region + buf.add(region) + elif region.start < start and region.end > end: + if not region in buf: + yield region + buf.add(region) + 
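+    # example use of fetch() above (names and coordinates are illustrative):
+    #   for region in bedfile.fetch('chr1', 10000, 20000, strand='+'):
+    #       print region
+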
+ def tell(self): + return self._tellpos + + def close(self): + pass + + @property + def length(self): + return self._length + + @property + def total(self): + return self._total + + def __iter__(self): + self._cur_bin_idx = 0 + self._cur_bin_pos = 0 + self._tellpos = 0 + return self + + def next(self): + if self._cur_bin_idx >= len(self._bin_list): + raise StopIteration + + binvals = self._bins[self._bin_list[self._cur_bin_idx]] + while self._cur_bin_pos < len(binvals): + val = binvals[self._cur_bin_pos] + self._cur_bin_pos += 1 + + startbin = (val.chrom, val.start / BedFile._bin_const) + if startbin == self._bin_list[self._cur_bin_idx]: + self._tellpos += 1 + return val + + self._cur_bin_idx += 1 + self._cur_bin_pos = 0 + return self.next() + + +class BedRegion(object): + def __init__(self, chrom, start, end, name='', score='', strand='', thickStart='', thickEnd='', rgb='', *args): + self.chrom = chrom + self.start = int(start) + self.end = int(end) + self.name = name + + if score == '': + self.score = 0 + else: + self.score = float(score) + + if strand == '': + self.strand = None + else: + self.strand = strand + + if thickStart == '': + self.thickStart = None + else: + self.thickStart = thickStart + + if thickEnd == '': + self.thickEnd = None + else: + self.thickEnd = thickEnd + + if rgb == '': + self.rgb = None + else: + self.rgb = rgb + + self.extras = args + + def clone(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, rgb=None, *args): + cols = [] + cols.append(self.chrom if chrom is None else chrom) + cols.append(self.start if start is None else start) + cols.append(self.end if end is None else end) + cols.append(self.name if name is None else name) + cols.append(self.score if score is None else score) + cols.append(self.strand if strand is None else strand) + cols.append(self.thickStart if thickStart is None else thickStart) + cols.append(self.thickEnd if thickEnd is None else thickEnd) + cols.append(self.rgb if rgb is None else rgb) + + for i, val in enumerate(self.extras): + if len(args) > i: + cols.append(args[i]) + else: + cols.append(val) + + return BedRegion(*cols) + + @property + def score_int(self): + score = str(self.score) + if score[-2:] == '.0': + score = score[:-2] + + return score + + def __key(self): + return (self.chrom, self.start, self.end, self.strand, self.name) + + def __lt__(self, other): + return self.__key() < other.__key() + + def __gt__(self, other): + return self.__key() > other.__key() + + def __eq__(self, other): + return self.__key() == other.__key() + + def write(self, out): + out.write('%s\n' % self) + + def __repr__(self): + outcols = [] + + if self.rgb: + outcols.append(self.rgb) + if self.thickEnd or outcols: + outcols.append(self.thickEnd if self.thickEnd else self.end) + if self.thickStart or outcols: + outcols.append(self.thickStart if self.thickStart else self.start) + if self.strand or outcols: + outcols.append(self.strand) + if self.score_int != '' or outcols: + outcols.append(self.score_int) + if self.name or outcols: + outcols.append(self.name) + + outcols.append(self.end) + outcols.append(self.start) + outcols.append(self.chrom) + + return '\t'. 
diff -r 000000000000 -r 4e4e4093d65d ngsutils/bed/__init__.pyc
Binary file ngsutils/bed/__init__.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/__init__.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,249 @@
+import collections
+import gzip
+import os
+import sys
+import re
+
+try:
+    from eta import ETA
+except ImportError:
+    pass
+
+
+class FASTARead(collections.namedtuple('FASTARecord', 'name comment seq')):
+    def __repr__(self):
+        if self.comment:
+            return '>%s %s\n%s\n' % (self.name, self.comment, self.seq)
+        return '>%s\n%s\n' % (self.name, self.seq)
+
+    def subseq(self, start, end, comment=None):
+        if self.comment:
+            comment = '%s %s' % (self.comment, comment)
+
+        return FASTARead(self.name, comment, self.seq[start:end])
+
+    def clone(self, name=None, comment=None, seq=None):
+        n = name if name else self.name
+        c = comment if comment else self.comment
+        s = seq if seq else self.seq
+
+        return FASTARead(n, c, s)
+
+    def write(self, out):
+        out.write(repr(self))
+
+
+class FASTA(object):
+    def __init__(self, fname=None, fileobj=None, qual=False):
+        self.fname = fname
+        self.qual = qual
+        if fileobj:
+            self.fileobj = fileobj
+        else:
+            if self.fname == '-':
+                self.fileobj = sys.stdin
+            elif self.fname[-3:] == '.gz' or self.fname[-4:] == '.bgz':
+                self.fileobj = gzip.open(os.path.expanduser(self.fname))
+            else:
+                self.fileobj = open(os.path.expanduser(self.fname))
+
+        if not self.fileobj:
+            raise ValueError("Missing valid filename or fileobj")
+
+    def close(self):
+        if self.fileobj != sys.stdin:
+            self.fileobj.close()
+
+    def tell(self):
+        # always relative to uncompressed...
+        return self.fileobj.tell()
+
+    def seek(self, pos, whence=0):
+        self.fileobj.seek(pos, whence)
+
+    def fetch(self, quiet=False):
+        name = ''
+        comment = ''
+        seq = ''
+
+        if not quiet and self.fname and self.fname != '-':
+            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
+        else:
+            eta = None
+
+        for line in self.fileobj:
+            line = line.strip()
+            if not line:
+                continue
+            if line[0] == '#':
+                continue
+
+            if line[0] == '>':
+                if name and seq:
+                    if eta:
+                        eta.print_status(extra=name)
+                    yield FASTARead(name, comment, seq)
+
+                spl = re.split(r'[ \t]', line[1:], maxsplit=1)
+                name = spl[0]
+                if len(spl) > 1:
+                    comment = spl[1]
+                else:
+                    comment = ''
+                seq = ''
+
+            else:
+                if self.qual:
+                    seq = seq + ' ' + line
+                else:
+                    seq += line
+
+        if name and seq:
+            if eta:
+                eta.print_status(extra=name)
+            yield FASTARead(name, comment, seq)
+
+        if eta:
+            eta.done()
+
+
+def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None):
+    if fileobj:
+        f = fileobj
+    elif fname == '-':
+        f = sys.stdin
+    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+        f = gzip.open(os.path.expanduser(fname))
+    else:
+        f = open(os.path.expanduser(fname))
+
+    if quiet or fname == '-':
+        eta = None
+    else:
+        eta = ETA(os.stat(fname).st_size, fileobj=f)
+
+    for line in f:
+        if eta:
+            if callback:
+                extra = callback()
+            else:
+                extra = ''
+
+            eta.print_status(extra=extra)
+        yield line
+
+        if done_callback and done_callback():
+            break
+
+    if f != sys.stdin:
+        f.close()
+
+    if eta:
+        eta.done()
+
+
+class Symbolize(object):
+    'Converts strings to symbols - basically a cache of strings'
+    def __init__(self):
+        self.__cache = {}
+
+    def __getitem__(self, k):
+        if not k in self.__cache:
+            self.__cache[k] = k
+
+        return self.__cache[k]
+
+
+symbols = Symbolize()
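A sketch of typical use of the FASTA reader above (the filename is hypothetical; .gz/.bgz inputs are opened via the gzip module, and quiet=True skips the optional ETA progress meter):

    fa = FASTA('genome.fa')
    for rec in fa.fetch(quiet=True):
        # rec is a FASTARead namedtuple: rec.name, rec.comment, rec.seq
        first50 = rec.subseq(0, 50, comment='first 50bp')
    fa.close()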
+_compliments = {
+    'a': 't',
+    'A': 'T',
+    'c': 'g',
+    'C': 'G',
+    'g': 'c',
+    'G': 'C',
+    't': 'a',
+    'T': 'A',
+    'n': 'n',
+    'N': 'N'
+}
+
+
+def revcomp(seq):
+    '''
+    >>> revcomp('ATCGatcg')
+    'cgatCGAT'
+    '''
+    ret = []
+
+    for s in seq:
+        ret.append(_compliments[s])
+
+    ret.reverse()
+    return ''.join(ret)
+
+
+class Counts(object):
+    '''
+    Setup simple binning. Bins are continuous 0->max. Values are added to
+    bins and then means / distributions can be calculated.
+    '''
+    def __init__(self):
+        self.bins = []
+
+    def add(self, val):
+        while len(self.bins) <= val:
+            self.bins.append(0)
+        self.bins[val] += 1
+
+    def mean(self):
+        acc = 0
+        count = 0
+
+        for i, val in enumerate(self.bins):
+            acc += (i * val)
+            count += val
+
+        if count > 0:
+            return float(acc) / count
+
+    def max(self):
+        return len(self.bins) - 1
+
+
+def memoize(func):
+    if 'TESTING' in os.environ or 'DEBUG' in os.environ:
+        return func
+
+    __cache = {}
+
+    def inner(*args, **kwargs):
+        k = (args, tuple(kwargs.iteritems()))
+        if k not in __cache:
+            __cache[k] = func(*args, **kwargs)
+        return __cache[k]
+
+    inner.__doc__ = '(@memoized %s)\n%s' % (func.__name__, func.__doc__)
+    return inner
+
+
+def quoted_split(s, delim, quote_char='"'):
+    tokens = []
+
+    buf = ""
+    inquote = False
+
+    for c in s:
+        if inquote:
+            buf += c
+            if c == quote_char:
+                inquote = False
+        elif c == delim:
+            tokens.append(buf)
+            buf = ""
+        else:
+            buf += c
+            if c == quote_char:
+                inquote = True
+
+    if buf:
+        tokens.append(buf)
+
+    return tokens
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/__init__.pyc
Binary file ngsutils/support/__init__.pyc has changed
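Two quick sketches of the helpers above: revcomp() preserves case per base, and Counts bins non-negative integers densely from zero:

    revcomp('GATTACA')   # -> 'TGTAATC'

    c = Counts()
    for v in [2, 2, 3]:
        c.add(v)
    c.bins       # -> [0, 0, 2, 1]
    c.mean()     # -> 7.0 / 3
    c.max()      # -> 3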
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/bgzip.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/bgzip.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+'''
+Extract the blocks from a BGZip file.
+
+BAM files are stored as blocks in a bgzip archive. This class
+will load the bgzip archive and output the block information.
+'''
+
+import sys
+import os
+import struct
+
+
+class BGZip(object):
+    def __init__(self, fname):
+        self.fname = fname
+        self.pos = 0
+        self.fileobj = open(self.fname)
+        self.fsize = os.stat(self.fname).st_size
+        self.cur_chunk = 0
+
+        self.cpos = 0
+        self.cdata = ''
+
+    def close(self):
+        self.fileobj.close()
+
+    def seek(self, offset):
+        # BGZF virtual offset: upper 48 bits locate the compressed block,
+        # lower 16 bits are the position within the uncompressed block
+        bgz_offset = offset >> 16
+        chunk_offset = offset & 0xFFFF
+
+        self.fileobj.seek(bgz_offset)
+        self.read_chunk()
+        self.cpos = chunk_offset
+
+    def read(self, amount, whence=0):
+        if whence not in [0, 1]:
+            print "Bad Whence value!: %s" % whence
+            return
+
+        if whence == 0:
+            self.seek(0)
+
+        ### read from chunk; if not enough data in chunk, read next chunk
+        ret = ''
+        while amount and self.pos < self.fsize:
+            if len(self.cdata) - self.cpos >= amount:
+                ret += self.cdata[self.cpos:self.cpos + amount]
+                self.cpos += amount
+                return ret
+            else:
+                taken = len(self.cdata) - self.cpos
+                ret += self.cdata[self.cpos:]
+                amount -= taken
+                self.read_chunk()
+        return ret
+
+    def read_chunk(self):
+        self.fileobj.seek(10)
+        id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('<BBBBIBBH')
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/dbsnp.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/dbsnp.py	Wed Nov 11 13:04:07 2015 -0500
+    def dump(self, chrom, op, pos, base, snp, exit=True):
+        print '->', op, chrom, pos, base
+        print snp
+        print snp.alleles
+        if exit:
+            sys.exit(1)
+
+    def is_valid_variation(self, chrom, op, pos, seq, verbose=False):
+        for snp in self.fetch(chrom, pos):
+            if not '/' in snp.observed or snp.clazz not in ['single', 'mixed', 'in-del', 'insertion', 'deletion']:
+                # these are odd variations that we can't deal with...
+                # (microsatellites, tooLongToDisplay members, etc)
+                continue
+
+            if op == 0:
+                if snp.clazz in ['single', 'mixed'] and seq in snp.alleles:
+                    return True
+                # elif verbose:
+                #     for alt in snp.alleles:
+                #         if len(alt) > 1:
+                #             self.dump(chrom, op, pos, seq, snp, False)
+
+            elif op == 1:
+                if snp.clazz in ['insertion', 'mixed', 'in-del']:
+                    if seq in snp.alleles:
+                        if len(seq) > 1 and verbose:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        return True
+
+                    if verbose:
+                        if len(seq) > 1:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        else:
+                            for alt in snp.alleles:
+                                if len(alt) > 1:
+                                    self.dump(chrom, op, pos, seq, snp, False)
+
+            elif op == 2:
+                if snp.clazz in ['deletion', 'mixed', 'in-del']:
+                    if '-' in snp.alleles and seq in snp.alleles:
+                        if len(seq) > 1 and verbose:
+                            self.dump(chrom, op, pos, seq, snp, False)
+                        return True
+
+        return False
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/dbsnp.pyc
Binary file ngsutils/support/dbsnp.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/llh.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/llh.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,55 @@
+'''
+Methods for calculating log-likelihoods for nucleotide frequencies
+'''
+import math
+import collections
+
+from ngsutils.support import memoize
+
+_default_background = {'A': 0.3, 'T': 0.3, 'C': 0.2, 'G': 0.2}
+
+NucleotideLogLikelihood = collections.namedtuple('NucleotideLogLikelihood', 'A C G T pseudo')
+
+
+@memoize
+def pseudo_count(N, bg):
+    '''
+    >>> pseudo_count(100, _default_background['A'])
+    3.0
+    >>> pseudo_count(100, _default_background['C'])
+    2.0
+    '''
+    return bg * math.sqrt(N)
+
+
+def calc_llh(A, C, G, T, bg=_default_background, pseudo='auto'):
+    if pseudo == 'auto':
+        N = A + C + G + T
+        Ap = float(A) + pseudo_count(N, bg['A'])
+        Cp = float(C) + pseudo_count(N, bg['C'])
+        Gp = float(G) + pseudo_count(N, bg['G'])
+        Tp = float(T) + pseudo_count(N, bg['T'])
+    elif pseudo:
+        Ap = float(A) + pseudo
+        Cp = float(C) + pseudo
+        Gp = float(G) + pseudo
+        Tp = float(T) + pseudo
+    else:
+        Ap = float(A)
+        Cp = float(C)
+        Gp = float(G)
+        Tp = float(T)
+
+    Np = Ap + Cp + Gp + Tp
+
+    freqA = Ap / Np
+    freqC = Cp / Np
+    freqG = Gp / Np
+    freqT = Tp / Np
+
+    return NucleotideLogLikelihood(math.log(freqA / bg['A']), math.log(freqC / bg['C']), math.log(freqG / bg['G']), math.log(freqT / bg['T']), pseudo)
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
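A worked example of the pseudocount logic in calc_llh(): with pseudo='auto', each count is padded by bg * sqrt(N) before its frequency is compared to the background, so enriched bases score positive and depleted ones negative. The values below use the default background and are illustrative:

    llh = calc_llh(60, 20, 10, 10)
    # N = 100, so A is padded by 0.3 * sqrt(100) = 3.0 -> freqA = 63.0 / 110 ~ 0.573
    # llh.A = log(0.573 / 0.3) ~ 0.65 (enriched)
    # llh.G = log((12.0 / 110) / 0.2) ~ -0.61 (depleted)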
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/ngs_utils.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/ngs_utils.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,225 @@
+#!/usr/bin/env python
+"""
+Common util classes / functions for the NGS project
+"""
+import sys
+import os
+import gzip
+import re
+import collections
+
+
+def format_number(n):
+    '''
+    >>> format_number(1000)
+    '1,000'
+    >>> format_number(1234567)
+    '1,234,567'
+    '''
+    ar = list(str(n))
+    for i in range(len(ar))[::-3][1:]:
+        ar.insert(i + 1, ',')
+    return ''.join(ar)
+
+
+def natural_sort(ar):
+    '''
+    >>> natural_sort('1 3 4 2 5'.split())
+    ['1', '2', '3', '4', '5']
+    >>> natural_sort('1 10 20 2 3 4'.split())
+    ['1', '2', '3', '4', '10', '20']
+    '''
+    to_sort = []
+    for item in ar:
+        spl = re.split(r'(\d+)', item)
+        l2 = []
+        for el in spl:
+            try:
+                n = int(el)
+            except ValueError:
+                n = el
+            l2.append(n)
+        to_sort.append((l2, item))
+
+    to_sort.sort()
+    return [x[1] for x in to_sort]
+
+
+def dictify(values, colnames):
+    """
+    Convert a list of values into a dictionary based upon given column names.
+
+    If the column name starts with an '@', the value is assumed to be a comma
+    separated list.
+
+    If the name starts with a '#', the value is assumed to be an int.
+
+    If the name starts with '@#', the value is assumed to be a comma separated
+    list of ints.
+    """
+    d = {}
+    for i in xrange(len(colnames)):
+        key = colnames[i]
+        split = False
+        num = False
+
+        if key[0] == '@':
+            key = key[1:]
+            split = True
+        if key[0] == '#':
+            key = key[1:]
+            num = True
+
+        if i < len(values):
+            if num and split:
+                val = [int(x) for x in values[i].rstrip(',').split(',')]
+            elif num:
+                val = int(values[i])
+            elif split:
+                val = values[i].rstrip(',').split(',')
+            else:
+                val = values[i]
+
+            d[key] = val
+        else:
+            d[key] = None
+
+    return d
+
+
+def gzip_aware_open(fname):
+    if fname == '-':
+        f = sys.stdin
+    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
+        f = gzip.open(os.path.expanduser(fname))
+    else:
+        f = open(os.path.expanduser(fname))
+    return f
+
+
+class gzip_opener:
+    '''
+    A Python 2.6 class to handle 'with' opening of text files that may
+    or may not be gzip compressed.
+    '''
+    def __init__(self, fname):
+        self.fname = fname
+
+    def __enter__(self):
+        self.f = gzip_aware_open(self.fname)
+        return self.f
+
+    def __exit__(self, type, value, traceback):
+        if self.f != sys.stdin:
+            self.f.close()
+        return False
+
+
+def filenames_to_uniq(names, new_delim='.'):
+    '''
+    Given a set of file names, produce a list of names consisting of the
+    uniq parts of the names. This works from the end of the name. Chunks of
+    the name are split on '.' and '-'.
+
+    For example:
+        A.foo.bar.txt
+        B.foo.bar.txt
+    returns: ['A', 'B']
+
+        AA.BB.foo.txt
+        CC.foo.txt
+    returns: ['AA.BB', 'CC']
+
+    >>> filenames_to_uniq('a.foo.bar.txt b.foo.bar.txt'.split())
+    ['a', 'b']
+    >>> filenames_to_uniq('a.b.foo.txt c.foo.txt'.split())
+    ['a.b', 'c']
+    '''
+    name_words = []
+    maxlen = 0
+    for name in names:
+        name_words.append(name.replace('.', ' ').replace('-', ' ').strip().split())
+        name_words[-1].reverse()
+        if len(name_words[-1]) > maxlen:
+            maxlen = len(name_words[-1])
+
+    common = [False, ] * maxlen
+    for i in xrange(maxlen):
+        last = None
+        same = True
+        for nameword in name_words:
+            if i >= len(nameword):
+                same = False
+                break
+            if not last:
+                last = nameword[i]
+            elif nameword[i] != last:
+                same = False
+                break
+        common[i] = same
+
+    newnames = []
+    for nameword in name_words:
+        nn = []
+        for (i, val) in enumerate(common):
+            if not val and i < len(nameword):
+                nn.append(nameword[i])
+        nn.reverse()
+        newnames.append(new_delim.join(nn))
+    return newnames
+
+
+def parse_args(argv, defaults=None, expected_argc=0):
+    opts = {}
+    if defaults:
+        opts.update(defaults)
+
+    args = []
+
+    i = 0
+    while i < len(argv):
+        if argv[i][0] == '-':
+            arg = argv[i].lstrip('-')
+            if '=' in arg:
+                k, v = arg.split('=', 1)
+                if defaults and k in defaults:
+                    if type(defaults[k]) == float:
+                        opts[k] = float(v)
+                    elif type(defaults[k]) == int:
+                        opts[k] = int(v)
+                    else:
+                        opts[k] = v
+            else:
+                opts[arg] = True
+        else:
+            args.append(argv[i])
+        i += 1
+
+    while len(args) < expected_argc:
+        args.append(None)
+    return opts, args
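How the dictify() prefixes compose, plus parse_args() casting a value to the type of its default; a short sketch with made-up values:

    dictify(['chr1', '9,14,20', '42'], ['chrom', '@#starts', '#score'])
    # -> {'chrom': 'chr1', 'starts': [9, 14, 20], 'score': 42}

    opts, args = parse_args(['-mean=5', 'in.bam'], {'mean': 0}, expected_argc=2)
    # -> opts['mean'] == 5 (int, matching the default); args == ['in.bam', None]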
+class memoize(object):
+    'Simple memoizing decorator to cache results'
+    def __init__(self, func):
+        self.func = func
+        self.cache = {}
+
+    def __call__(self, *args):
+        if not isinstance(args, collections.Hashable):
+            # uncacheable. a list, for instance.
+            # better to not cache than blow up.
+            return self.func(*args)
+
+        if args in self.cache:
+            return self.cache[args]
+        else:
+            value = self.func(*args)
+            self.cache[args] = value
+            return value
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/ngs_utils.pyc
Binary file ngsutils/support/ngs_utils.pyc has changed
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/regions.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/regions.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,166 @@
+class RangeMatch(object):
+    '''
+    Simple genomic ranges. You can define chrom:start-end ranges, then ask if a
+    particular genomic coordinate maps to any of those ranges. This is less
+    efficient than an R-Tree, but easier to code.
+    '''
+    def __init__(self, name):
+        self.ranges = {}
+        self.name = name
+
+    def add_range(self, chrom, strand, start, end):
+        if not chrom in self.ranges:
+            self.ranges[chrom] = {}
+
+        bin = start / 100000
+        if not bin in self.ranges[chrom]:
+            self.ranges[chrom][bin] = []
+        self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+        if (end / 100000) != bin:
+            for bin in xrange(bin + 1, (end / 100000) + 1):
+                if not bin in self.ranges[chrom]:
+                    self.ranges[chrom][bin] = []
+                self.ranges[chrom][bin].insert(0, (start, end, strand))
+
+    def get_tag(self, chrom, strand, pos, ignore_strand=False):
+        '''
+        returns (region, is_reverse_orientation)
+        '''
+        if not chrom in self.ranges:
+            return None, False
+        bin = pos / 100000
+        if not bin in self.ranges[chrom]:
+            return None, False
+        for start, end, r_strand in self.ranges[chrom][bin]:
+            if pos >= start and pos <= end:
+                if ignore_strand or strand == r_strand:
+                    return self.name, False
+                return self.name, True
+        return None, False
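A sketch of the binned lookup in RangeMatch (coordinates are arbitrary; a range spanning a 100kb boundary is registered in every bin it overlaps):

    rm = RangeMatch('promoter')
    rm.add_range('chr1', '+', 99900, 100200)  # lands in bins 0 and 1
    rm.get_tag('chr1', '+', 100100)   # -> ('promoter', False): strand matches
    rm.get_tag('chr1', '-', 100100)   # -> ('promoter', True): antisense hit
    rm.get_tag('chr1', '+', 500000)   # -> (None, False): empty bin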
+class RegionTagger(object):
+    def __init__(self, gtf, valid_chroms=None, only_first_fragment=True):
+        self.regions = []
+        self.counts = {}
+        self.only_first_fragment = only_first_fragment
+
+        coding = RangeMatch('coding')
+        exons = RangeMatch('other-exon')
+        utr_5 = RangeMatch('utr-5')
+        utr_3 = RangeMatch('utr-3')
+        introns = RangeMatch('intron')
+        promoters = RangeMatch('promoter')
+
+        for gene in gtf.genes:
+            if valid_chroms and not gene.chrom in valid_chroms:
+                continue
+            if gene.strand == '+':
+                promoters.add_range(gene.chrom, gene.strand, gene.start - 2000, gene.start)
+            else:
+                promoters.add_range(gene.chrom, gene.strand, gene.end, gene.end + 2000)
+
+            for transcript in gene.transcripts:
+                if transcript.has_cds:
+                    for start, end in transcript.cds:
+                        coding.add_range(gene.chrom, gene.strand, start, end)
+
+                # TODO: Fix this so that it iterates over exons in the 5'/3' UTRs
+                for s, e in transcript.utr_5:
+                    utr_5.add_range(gene.chrom, gene.strand, s, e)
+                for s, e in transcript.utr_3:
+                    utr_3.add_range(gene.chrom, gene.strand, s, e)
+
+                last_end = None
+                for start, end in transcript.exons:
+                    if last_end:
+                        introns.add_range(gene.chrom, gene.strand, last_end, start)
+                    exons.add_range(gene.chrom, gene.strand, start, end)
+                    last_end = end
+
+        self.regions.append(coding)
+        self.regions.append(utr_5)
+        self.regions.append(utr_3)
+        self.regions.append(exons)
+        self.regions.append(introns)
+        self.regions.append(promoters)
+
+        self.counts['coding'] = 0
+        self.counts['coding-rev'] = 0
+        self.counts['other-exon'] = 0
+        self.counts['utr-5'] = 0
+        self.counts['utr-3'] = 0
+        self.counts['utr-5-rev'] = 0
+        self.counts['utr-3-rev'] = 0
+        self.counts['intron'] = 0
+        self.counts['promoter'] = 0
+        self.counts['other-exon-rev'] = 0
+        self.counts['intron-rev'] = 0
+        self.counts['promoter-rev'] = 0
+        self.counts['junction'] = 0
+        self.counts['intergenic'] = 0
+        self.counts['mitochondrial'] = 0
+
+    def add_read(self, read, chrom):
+        if read.is_unmapped:
+            return
+
+        if self.only_first_fragment and read.is_paired and not read.is_read1:
+            return
+
+        tag = None
+        is_rev = False
+
+        strand = '-' if read.is_reverse else '+'
+
+        if chrom == 'chrM':
+            tag = 'mitochondrial'
+
+        if not tag:
+            for op, length in read.cigar:
+                if op == 3:
+                    tag = 'junction'
+                    break
+
+        if not tag:
+            for region in self.regions:
+                tag, is_rev = region.get_tag(chrom, strand, read.pos)
+                if tag:
+                    break
+
+        if not tag:
+            tag = 'intergenic'
+
+        if tag:
+            if is_rev:
+                self.counts['%s-rev' % tag] += 1
+            else:
+                self.counts[tag] += 1
+
+        return tag
+
+    def tag_region(self, chrom, start, end, strand):
+        tag = None
+        is_rev = False
+
+        if chrom == 'chrM' or chrom == 'M':
+            tag = 'mitochondrial'
+
+        if not tag:
+            for region in self.regions:
+                tag, is_rev = region.get_tag(chrom, strand, start)
+                if is_rev:
+                    tag = '%s-rev' % tag
+
+                if start != end:
+                    endtag, is_rev = region.get_tag(chrom, strand, end)
+                    if is_rev:
+                        endtag = '%s-rev' % endtag
+
+                if tag and endtag and endtag != tag:
+                    tag = '%s/%s' % (tag, endtag)
+
+        if not tag:
+            tag = 'intergenic'
\ No newline at end of file
diff -r 000000000000 -r 4e4e4093d65d ngsutils/support/stats.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsutils/support/stats.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,161 @@
+'''
+Various statistical tests and methods...
+'''
+import math
+
+from ngsutils.support import memoize
+
+
+def median(vals):
+    '''
+    >>> median([1,2,3])
+    2
+    >>> median([1,2,3,4])
+    2.5
+    '''
+    vals.sort()
+
+    if len(vals) % 2 == 1:
+        return vals[len(vals) / 2]
+    else:
+        a = vals[(len(vals) / 2) - 1]
+        b = vals[(len(vals) / 2)]
+        return float(a + b) / 2
+
+
+def mean_stdev(l):
+    '''
+    >>> mean_stdev([1,2,2,2])
+    (1.75, 0.5)
+    >>> mean_stdev([2,2,2,2])
+    (2.0, 0.0)
+    '''
+    acc = 0
+    for el in l:
+        acc += el
+
+    mean = float(acc) / len(l)
+    acc = 0
+    for el in l:
+        acc += (el - mean) ** 2
+
+    if len(l) > 2:
+        stdev = math.sqrt(float(acc) / (len(l) - 1))
+    else:
+        stdev = 0.0
+
+    return (mean, stdev)
+
+
+def counts_median(d):
+    '''
+    Calculate the median from counts stored in a dictionary
+
+    >>> counts_median({ 1: 4, 2: 1, 3: 4 })
+    2
+    >>> counts_median({ 1: 4, 3: 4 })
+    2
+    '''
+    count = 0
+    for k in d:
+        count += d[k]
+
+    if count == 0:
+        return 0
+
+    acc = 0.0
+    last = 0
+    for k in sorted(d):
+        if last:
+            return (last + k) / 2
+        acc += d[k]
+        if acc / count == 0.5:
+            last = k
+        elif acc / count > 0.5:
+            return k
+
+
+def counts_mean_stdev(d):
+    '''
+    Calc mean / stdev when data is stored as counts in a dictionary
+
+    Ex:
+        { 1: 4, 2: 1, 3: 4 } = [1, 1, 1, 1, 2, 3, 3, 3, 3]
+
+    >>> counts_mean_stdev({ 1: 4, 2: 1, 3: 4 })
+    (2.0, 1.0)
+    '''
+    acc = 0
+    count = 0
+    for k in d:
+        acc += k * d[k]
+        count += d[k]
+
+    mean = float(acc) / count
+
+    acc = 0
+    for k in d:
+        acc += (((k - mean) ** 2) * d[k])
+
+    if count > 2:
+        stdev = math.sqrt(float(acc) / (count - 1))
+    else:
+        stdev = 0.0
+
+    return (mean, stdev)
+
+
+@memoize
+def poisson_prob(x, mean):
+    '''
+    Return the probability that you could get x counts in
+    a Poisson test with a mean value.
+
+    prob(x) = sum(i=1..x){poisson(i)}
+
+    >>> poisson_prob(6,10)
+    0.1300960209527205
+    >>> poisson_prob(8,10)
+    0.33277427882095645
+    '''
+    acc = 0.0
+    for i in xrange(1, x + 1):
+        acc += poisson_func(i, mean)
+    return acc
+
+
+@memoize
+def poisson_func(mu, lambd):
+    '''
+    This is the Poisson distribution function
+
+    p(mu) = (lambda^mu * e^(-lambda)) / (mu!)
+
+    mu is a count
+    lambd is the mean
+
+    >>> poisson_func(1,10)
+    0.00045399929762484856
+    >>> poisson_func(2,10)
+    0.0022699964881242427
+    >>> poisson_func(3,10)
+    0.007566654960414142
+    '''
+    return (lambd ** mu) * (math.exp(-1 * lambd)) / _factorial(mu)
+
+
+@memoize
+def _factorial(x):
+    '''
+    >>> _factorial(1)
+    1
+    >>> _factorial(2)
+    2
+    >>> _factorial(3)
+    6
+    '''
+    return math.factorial(x)
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
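Two quick checks on the statistics above, runnable as-is. Note that poisson_prob() sums the pmf from i = 1, so the X = 0 term is excluded from the cumulative total:

    mean_stdev([1, 1, 1, 1, 2, 3, 3, 3, 3])   # -> (2.0, 1.0)
    counts_mean_stdev({1: 4, 2: 1, 3: 4})     # -> (2.0, 1.0), same data as counts

    poisson_prob(8, 10)   # -> 0.3327... = P(1 <= X <= 8) with lambda = 10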
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_input1.bam
Binary file test-data/ngsutils_bam_filter_input1.bam has changed
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result1.bam
Binary file test-data/ngsutils_bam_filter_result1.bam has changed
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result2.bam
Binary file test-data/ngsutils_bam_filter_result2.bam has changed
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result3.bam
Binary file test-data/ngsutils_bam_filter_result3.bam has changed
diff -r 000000000000 -r 4e4e4093d65d test-data/ngsutils_bam_filter_result4.bam
Binary file test-data/ngsutils_bam_filter_result4.bam has changed
diff -r 000000000000 -r 4e4e4093d65d tool_dependencies.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,6 @@
+
+
+
+
+
+