Mercurial > repos > iuc > ngsutils_bam_filter
diff filter.py @ 0:4e4e4093d65d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author      iuc
date        Wed, 11 Nov 2015 13:04:07 -0500
parents     (none)
children    7a68005de299
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter.py	Wed Nov 11 13:04:07 2015 -0500
@@ -0,0 +1,986 @@
+#!/usr/bin/env python
+## category General
+## desc Removes reads from a BAM file based on criteria
+"""
+Removes reads from a BAM file based on criteria
+
+Given a BAM file, this script will only allow reads that meet filtering
+criteria to be written to output. The output is another BAM file with the
+reads not matching the criteria removed.
+
+Note: this does not adjust tag values reflecting any filtering. (for example:
+      if a read mapped to two locations (IH:i:2), and one was removed by
+      filtering, the IH:i tag would still read IH:i:2).
+
+Currently, the available filters are:
+    -minlen val                Remove reads that are smaller than {val}
+    -maxlen val                Remove reads that are larger than {val}
+    -mapped                    Keep only mapped reads
+    -unmapped                  Keep only unmapped reads
+    -properpair                Keep only properly paired reads (both mapped,
+                               correct orientation, flag set in BAM)
+    -noproperpair              Keep only not-properly paired reads
+
+    -mask bitmask              Remove reads that match the mask (base 10/hex)
+    -uniq {length}             Remove reads that are have the same sequence
+                               Note: BAM file should be sorted
+                               (up to an optional length)
+    -uniq_start                Remove reads that start at the same position
+                               Note: BAM file should be sorted
+                               (Use only for low-coverage samples)
+
+    -mismatch num              # mismatches or indels
+                               indel always counts as 1 regardless of length
+                               (requires NM tag in reads)
+
+    -mismatch_dbsnp num dbsnp.txt.bgz
+                               # mismatches or indels - not in dbSNP.
+                               Variations are called using the MD tag.
+                               Variations that are found in the dbSNP list are
+                               not counted as mismatches. The dbSNP list is a
+                               Tabix-indexed dump of dbSNP (from UCSC Genome
+                               Browser). Indels in dbSNP are also counted.
+                               Adds a 'ZS:i' tag with the number of found SNPs
+                               in the read.
+                               (requires NM and MD tags)
+
+                               Example command for indexing:
+                               ngsutils tabixindex snp.txt.gz -s 2 -b 3 -e 4 -0
+
+    -mismatch_ref num ref.fa   # mismatches or indel - looks up mismatches
+                               directly in a reference FASTA file
+                               (use if NM tag not present)
+
+    -mismatch_ref_dbsnp num ref.fa dbsnp.txt.bgz
+                               # mismatches or indels - looks up mismatches
+                               directly from a reference FASTA file. (See
+                               -mismatch_dbsnp for dbSNP matching)
+                               (use if NM or MD tag not present)
+
+    -nosecondary               Remove reads that have the 0x100 flag set
+    -noqcfail                  Remove reads that have the 0x200 flag set
+    -nopcrdup                  Remove reads that have the 0x400 flag set
+
+
+    -exclude ref:start-end     Remove reads in this region (1-based start)
+    -excludebed file.bed {nostrand}
+                               Remove reads that are in any of the regions
+                               from the given BED file. If 'nostrand' is given,
+                               strand information from the BED file is ignored.
+
+    -include ref:start-end     Remove reads NOT in the region (can only be one)
+    -includebed file.bed {nostrand}
+                               Remove reads that are NOT any of the regions
+                               from the given BED file. If 'nostrand' is given,
+                               strand information from the BED file is ignored.
+
+                               Note: If this is a large dataset, use
+                               "bamutils extract" instead.
+
+    -includeref refname        Exclude reads NOT mapped to a reference
+    -excluderef refname        Exclude reads mapped to a particular reference
+                               (e.g. chrM, or _dup chromosomes)
+
+    -whitelist fname           Remove reads that aren't on this list (by name)
+    -blacklist fname           Remove reads that are on this list (by name)
+                               These lists can be whitespace-delimited with
+                               the read name as the first column.
+
+    -maximum_mismatch_ratio val
+                               Filter by maximum mismatch ratio (fraction of length)
+
+    -eq tag_name value
+    -lt tag_name value
+    -lte tag_name value
+    -gt tag_name value
+    -gte tag_name value
+
+    As a special case, "MAPQ" can be used as the tag_name and the SAM MAPQ
+    value will be used.
+
+Common tags to filter by:
+    AS      Alignment score
+    IH      Number of alignments
+    NM      Edit distance (each indel counts as many as its length)
+
+    MAPQ    Mapping quality (defined as part of SAM spec)
+
+    The tag type (:i, :f, :Z) is optional.
+
+"""
+
+import os
+import sys
+import pysam
+from ngsutils.bam import bam_iter
+from ngsutils.support.dbsnp import DBSNP
+from ngsutils.bam import read_calc_mismatches, read_calc_mismatches_ref, read_calc_mismatches_gen, read_calc_variations
+from ngsutils.bed import BedFile
+
+
+def usage():
+    print __doc__
+    print """
+Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria...
+
+Options:
+  -failed fname    A text file containing the read names of all reads
+                   that were removed with filtering
+
+Example:
+bamutils filter filename.bam output.bam -mapped -gte AS:i 1000
+
+This will remove all unmapped reads, as well as any reads that have an AS:i
+value less than 1000.
+"""
+    sys.exit(1)
+
+
+class Unique(object):
+    def __init__(self, length=None):
+        if length:
+            self.length = int(length)
+        else:
+            self.length = None
+
+        self.last_pos = None
+        self.pos_reads = set()
+
+    def __repr__(self):
+        return "uniq"
+
+    def filter(self, bam, read):
+        if self.last_pos != (read.tid, read.pos):
+            self.last_pos = (read.tid, read.pos)
+            self.pos_reads = set()
+
+        if read.is_reverse:
+            seq = read.seq[::-1]  # ignore revcomp for now, it isn't needed, just need to compare in the proper order
+        else:
+            seq = read.seq
+
+        if self.length:
+            seq = seq[:self.length]
+
+        if seq in self.pos_reads:
+            return False
+
+        self.pos_reads.add(seq)
+        return True
+
+    def close(self):
+        pass
+
+
+class UniqueStart(object):
+    def __init__(self):
+        self.last_tid = None
+        self.last_fwd_pos = -1
+        self.rev_pos_list = None
+
+    def __repr__(self):
+        return "uniq_start"
+
+    def filter(self, bam, read):
+        if self.last_tid is None or self.last_tid != read.tid:
+            self.last_tid = read.tid
+            self.rev_pos = set()
+            self.last_fwd_pos = -1
+            self.last_rev_pos = -1
+
+        if read.is_reverse:
+            # check reverse reads from their start (3' aend)
+            # these aren't necessarily in the correct
+            # order in the file, so we have to track them in a set
+
+            start_pos = read.aend
+
+            # clean up hash if necesary
+            # Allow up to 100k over, to balance cleaning up the rev_pos hash
+            # and memory
+            if read.pos > (self.last_rev_pos + 100000):
+                self.last_rev_pos = start_pos
+                del_list = []
+                for k in self.rev_pos:
+                    if k < read.pos:
+                        del_list.append(k)
+
+                for k in del_list:
+                    self.rev_pos.remove(k)
+
+            if not start_pos in self.rev_pos:
+                self.rev_pos.add(start_pos)
+                return True
+            return False
+        else:
+            if read.pos != self.last_fwd_pos:
+                self.last_fwd_pos = read.pos
+                return True
+            return False
+
+        # if self.last_pos != (read.tid, read.pos):
+        #     self.last_pos = (read.tid, read.pos)
+        #     return True
+        # return False
+
+    def close(self):
+        pass
+
+
+class Blacklist(object):
+    def __init__(self, fname):
+        self.fname = fname
+        self.notallowed = set()
+        with open(fname) as f:
+            for line in f:
+                self.notallowed.add(line.strip().split()[0])
+
+    def filter(self, bam, read):
+        return read.qname not in self.notallowed
+
+    def __repr__(self):
+        return 'Blacklist: %s' % (self.fname)
+
+    def close(self):
+        pass
+
+
+class Whitelist(object):
+    def __init__(self, fname):
+        self.fname = fname
+        self.allowed = set()
+        with open(fname) as f:
+            for line in f:
+                self.allowed.add(line.strip().split()[0])
+
+    def filter(self, bam, read):
+        return read.qname in self.allowed
+
+    def __repr__(self):
+        return 'Whitelist: %s' % (self.fname)
+
+    def close(self):
+        pass
+
+
+class IncludeRegion(object):
+    _excludes = []
+    _last = None
+
+    def __init__(self, region):
+        IncludeRegion._excludes.append(ExcludeRegion(region))
+        # self.excl = ExcludeRegion(region)
+
+    def filter(self, bam, read):
+        if read == IncludeRegion._last:
+            return True
+
+        IncludeRegion._last = read
+
+        for excl in IncludeRegion._excludes:
+            if not excl.filter(bam, read):
+                return True
+        return False
+
+        # return not self.excl.filter(bam,read)
+    def __repr__(self):
+        return 'Including: %s' % (self.excl.region)
+
+    def close(self):
+        pass
+
+
+class IncludeBED(object):
+    def __init__(self, fname, nostrand=None):
+        self.excl = ExcludeBED(fname, nostrand)
+
+    def filter(self, bam, read):
+        return not self.excl.filter(bam, read)
+
+    def __repr__(self):
+        return 'Including from BED: %s%s' % (self.excl.fname, ' nostrand' if self.excl.nostrand else '')
+
+    def close(self):
+        pass
+
+
+class ExcludeRegion(object):
+    def __init__(self, region):
+        self.region = region
+        spl = region.split(':')
+        self.chrom = spl[0]
+        se = [int(x) for x in spl[1].split('-')]
+        self.start = se[0] - 1
+        self.end = se[1]
+
+    def filter(self, bam, read):
+        if not read.is_unmapped:
+            if bam.getrname(read.tid) == self.chrom:
+                if self.start <= read.pos <= self.end:
+                    return False
+                if self.start <= read.aend <= self.end:
+                    return False
+        return True
+
+    def __repr__(self):
+        return 'Excluding: %s' % (self.region)
+
+    def close(self):
+        pass
+
+
+class ExcludeRef(object):
+    def __init__(self, ref):
+        self.ref = ref
+
+    def filter(self, bam, read):
+        if not read.is_unmapped:
+            if bam.getrname(read.tid) == self.ref:
+                return False
+        return True
+
+    def __repr__(self):
+        return 'Excluding: %s' % (self.ref)
+
+    def close(self):
+        pass
+
+class IncludeRef(object):
+    def __init__(self, ref):
+        self.ref = ref
+
+    def filter(self, bam, read):
+        if not read.is_unmapped:
+            if bam.getrname(read.tid) == self.ref:
+                return True
+        return False
+
+    def __repr__(self):
+        return 'Including: %s' % (self.ref)
+
+    def close(self):
+        pass
+
+
+class ExcludeBED(object):
+    def __init__(self, fname, nostrand=None):
+        self.regions = {}  # store BED regions as keyed bins (chrom, bin)
+        self.fname = fname
+        if nostrand == 'nostrand':
+            self.nostrand = True
+        else:
+            self.nostrand = False
+
+        self.bed = BedFile(fname)
+        # with open(fname) as f:
+        #     for line in f:
+        #         if not line:
+        #             continue
+        #         if line[0] == '#':
+        #             continue
+        #         cols = line.strip().split('\t')
+
+        #         chrom = cols[0]
+        #         start = int(cols[1])
+        #         end = int(cols[2])
+        #         if self.nostrand:
+        #             strand = '?'
+        #         else:
+        #             strand = cols[5]
+
+        #         startbin = start / 100000
+        #         endbin = end / 100000
+
+        #         for bin in xrange(startbin, endbin + 1):
+        #             if not (chrom, bin) in self.regions:
+        #                 self.regions[(chrom, bin)] = []
+        #             self.regions[(chrom, bin)].append((start, end, strand))
+
+    def filter(self, bam, read):
+        if not read.is_unmapped:
+            if self.nostrand:
+                strand = None
+            elif read.is_reverse:
+                strand = '-'
+            else:
+                strand = '+'
+
+            for region in self.bed.fetch(bam.getrname(read.tid), read.pos, read.aend, strand):
+                # region found, exclude read
+                return False
+        return True
+
+        # bin = read.pos / 100000
+        # ref = bam.getrname(read.tid)
+
+        # if not (ref, bin) in self.regions:
+        #     return True
+
+        # for start, end, strand in self.regions[(ref, bin)]:
+        #     if not self.nostrand:
+        #         if strand == '+' and read.is_reverse:
+        #             continue
+        #         if strand == '-' and not read.is_reverse:
+        #             continue
+        #     if start <= read.pos <= end:
+        #         return False
+        #     if start <= read.aend <= end:
+        #         return False
+        # return True
+
+    def __repr__(self):
+        return 'Excluding from BED: %s%s' % (self.fname, ' nostrand' if self.nostrand else '')
+
+    def close(self):
+        pass
+
+
+class Mismatch(object):
+    def __init__(self, num):
+        self.num = int(num)
+
+    def filter(self, bam, read):
+        if read.is_unmapped:
+            return False
+
+        if read_calc_mismatches(read) > self.num:
+            return False
+
+        return True
+
+    def __repr__(self):
+        return '>%s mismatch%s' % (self.num, '' if self.num == 1 else 'es')
+
+    def close(self):
+        pass
+
+
+class MismatchRef(object):
+    def __init__(self, num, refname):
+        self.num = int(num)
+        self.refname = refname
+
+        if not os.path.exists('%s.fai' % refname):
+            pysam.faidx(refname)
+
+        self.ref = pysam.Fastafile(refname)
+
+    def filter(self, bam, read):
+        if read.is_unmapped:
+            return False
+
+        chrom = bam.getrname(read.tid)
+        if read_calc_mismatches_ref(self.ref, read, chrom) > self.num:
+            return False
+
+        return True
+
+    def __repr__(self):
+        return '>%s mismatch%s in %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.refname))
+
+    def close(self):
+        self.ref.close()
+
+
+class MismatchDbSNP(object):
+    def __init__(self, num, fname, verbose=None):
+        sys.stderr.write('Note: MismatchDbSNP is considered *experimental*\n')
+
+        self.num = int(num)
+        self.fname = fname
+        self.dbsnp = DBSNP(fname)
+        if verbose == 'verbose':
+            self.verbose = True
+        else:
+            self.verbose = False
+
+    def filter(self, bam, read):
+        if read.is_unmapped:
+            return False
+
+        if read_calc_mismatches(read) <= self.num:
+            return True
+
+        chrom = bam.getrname(read.tid)
+
+        mm = 0
+        snps = 0
+
+        for op, pos, seq in read_calc_variations(read):
+            if not self.dbsnp.is_valid_variation(chrom, op, pos, seq, self.verbose):
+                mm += 1
+            else:
+                snps += 1
+
+        if mm > self.num:
+            return False
+
+        if snps:
+            read.tags = read.tags + [('ZS', snps)]
+
+        return True
+
+    def __repr__(self):
+        return '>%s mismatch%s using %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.fname))
+
+    def close(self):
+        self.dbsnp.close()
+
+
+class MismatchRefDbSNP(object):
+    def __init__(self, num, refname, dbsnpname):
+        sys.stderr.write('Note: MismatchRefDbSNP is considered *experimental*\n')
+        self.num = int(num)
+        self.refname = refname
+        self.dbsnp = DBSNP(dbsnpname)
+
+        if not os.path.exists('%s.fai' % refname):
+            pysam.faidx(refname)
+
+        self.ref = pysam.Fastafile(refname)
+
+    def filter(self, bam, read):
+        if read.is_unmapped:
+            return False
+
+        chrom = bam.getrname(read.tid)
+
+        mm = 0
+        snps = 0
+
+        for op, pos, seq in read_calc_mismatches_gen(self.ref, read, chrom):
+            if not self.dbsnp.is_valid_variation(chrom, op, pos, seq):
+                mm += 1
+            else:
+                snps += 1
+
+        if mm > self.num:
+            return False
+
+        if snps:
+            read.tags = read.tags + [('ZS', snps)]
+
+        return True
+
+    def __repr__(self):
+        return '>%s mismatch%s using %s/%s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.dbsnpname), os.path.basename(self.refname))
+
+    def close(self):
+        self.ref.close()
+        self.dbsnp.close()
+
+
+class Mapped(object):
+    def __init__(self):
+        pass
+
+    def filter(self, bam, read):
+        if read.is_paired and (read.is_unmapped or read.mate_is_unmapped):
+            return False
+        elif read.is_unmapped:
+            return False
+        return True
+
+    def __repr__(self):
+        return 'is mapped'
+
+    def close(self):
+        pass
+
+
+class Unmapped(object):
+    def __init__(self):
+        pass
+
+    def filter(self, bam, read):
+        if read.is_paired and (read.is_unmapped or read.mate_is_unmapped):
+            return True
+        elif read.is_unmapped:
+            return True
+        return False
+
+    def __repr__(self):
+        return 'is unmapped'
+
+    def close(self):
+        pass
+
+
+class ProperPair(object):
+    def __init__(self):
+        pass
+
+    def filter(self, bam, read):
+        if not read.is_paired:
+            return False
+
+        if read.is_unmapped or read.mate_is_unmapped:
+            return False
+
+        if read.is_reverse == read.mate_is_reverse:
+            return False
+
+        return read.is_proper_pair
+
+    def __repr__(self):
+        return 'proper pair'
+
+    def close(self):
+        pass
+
+
+class NoProperPair(object):
+    def __init__(self):
+        self.proper = ProperPair()
+        pass
+
+    def filter(self, bam, read):
+        return not self.proper.filter(bam, read)
+
+    def __repr__(self):
+        return 'not proper pairs'
+
+    def close(self):
+        pass
+
+
+class MaskFlag(object):
+    def __init__(self, value):
+        if type(value) == type(1):
+            self.flag = value
+        else:
+            if value[0:2] == '0x':
+                self.flag = int(value, 16)
+            else:
+                self.flag = int(value)
+
+    def __repr__(self):
+        return "Doesn't match flag: %s" % self.flag
+
+    def filter(self, bam, read):
+        return (read.flag & self.flag) == 0
+
+    def close(self):
+        pass
+
+
+class SecondaryFlag(object):
+    def __repr__(self):
+        return "no 0x100 (secondary) flag"
+
+    def filter(self, bam, read):
+        return not read.is_secondary
+
+    def close(self):
+        pass
+
+
+class ReadMinLength(object):
+    def __init__(self, minval):
+        self.minval = int(minval)
+
+    def __repr__(self):
+        return "read length min: %s" % self.minval
+
+    def filter(self, bam, read):
+        return len(read.seq) >= self.minval
+
+    def close(self):
+        pass
+
+
+class ReadMaxLength(object):
+    def __init__(self, val):
+        self.val = int(val)
+
+    def __repr__(self):
+        return "read length max: %s" % self.val
+
+    def filter(self, bam, read):
+        return len(read.seq) <= self.val
+
+    def close(self):
+        pass
+
+
+class MaximumMismatchRatio(object):
+    def __init__(self, ratio):
+        self.ratio = float(ratio)
+
+    def __repr__(self):
+        return "maximum mismatch ratio: %s" % self.val
+
+    def filter(self, bam, read):
+        return read_calc_mismatches(read) <= self.ratio*len(read.seq)
+
+    def close(self):
+        pass
+
+
+class QCFailFlag(object):
+    def __repr__(self):
+        return "no 0x200 (qcfail) flag"
+
+    def filter(self, bam, read):
+        return not read.is_qcfail
+
+    def close(self):
+        pass
+
+
+class PCRDupFlag(object):
+    def __repr__(self):
+        return "no 0x400 (pcrdup) flag"
+
+    def filter(self, bam, read):
+        return not read.is_duplicate
+
+    def close(self):
+        pass
+
+
+class _TagCompare(object):
+    def __init__(self, tag, value):
+        self.args = '%s %s' % (tag, value)
+
+        if ':' in tag:
+            self.tag = tag.split(':')[0]
+            tagtype = tag.split(':')[1]
+            if tagtype == 'i':
+                self.value = int(value)
+            elif tagtype == 'f':
+                self.value = float(value)
+            elif tagtype == 'H':
+                self.value = int(value)  # not sure about this
+            else:
+                self.value = value
+        else:
+            self.tag = tag
+
+            # guess at type...
+            try:
+                self.value = int(value)
+            except:
+                try:
+                    self.value = float(value)
+                except:
+                    self.value = value
+
+    def get_value(self, read):
+        if self.tag == 'MAPQ':
+            return read.mapq
+
+        for name, value in read.tags:
+            if name == self.tag:
+                return value
+
+        return None
+
+    def __repr__(self):
+        return "%s %s %s" % (self.tag, self.__class__.op, self.value)
+
+    def close(self):
+        pass
+
+
+class TagLessThan(_TagCompare):
+    op = '<'
+
+    def filter(self, bam, read):
+        if self.get_value(read) < self.value:
+            return True
+        return False
+
+
+class TagLessThanEqual(_TagCompare):
+    op = '<='
+
+    def filter(self, bam, read):
+        if self.get_value(read) <= self.value:
+            return True
+        return False
+
+
+class TagGreaterThan(_TagCompare):
+    op = '>'
+
+    def filter(self, bam, read):
+        if self.get_value(read) > self.value:
+            return True
+        return False
+
+
+class TagGreaterThanEqual(_TagCompare):
+    op = '>='
+
+    def filter(self, bam, read):
+        if self.get_value(read) >= self.value:
+            return True
+        return False
+
+
+class TagEqual(_TagCompare):
+    op = '='
+
+    def filter(self, bam, read):
+        if self.get_value(read) == self.value:
+            return True
+        return False
+
+_criteria = {
+    'mapped': Mapped,
+    'unmapped': Unmapped,
+    'properpair': ProperPair,
+    'noproperpair': NoProperPair,
+    'noqcfail': QCFailFlag,
+    'nosecondary': SecondaryFlag,
+    'nopcrdup': PCRDupFlag,
+    'mask': MaskFlag,
+    'lt': TagLessThan,
+    'gt': TagGreaterThan,
+    'lte': TagLessThanEqual,
+    'gte': TagGreaterThanEqual,
+    'eq': TagEqual,
+    'mismatch': Mismatch,
+    'mismatch_ref': MismatchRef,
+    'mismatch_dbsnp': MismatchDbSNP,
+    'mismatch_ref_dbsnp': MismatchRefDbSNP,
+    'exclude': ExcludeRegion,
+    'excludebed': ExcludeBED,
+    'excluderef': ExcludeRef,
+    'includeref': IncludeRef,
+    'include': IncludeRegion,
+    'includebed': IncludeBED,
+    'whitelist': Whitelist,
+    'blacklist': Blacklist,
+    'length': ReadMinLength,  # deprecated
+    'minlen': ReadMinLength,
+    'maxlen': ReadMaxLength,
+    'uniq': Unique,
+    'maximum_mismatch_ratio': MaximumMismatchRatio
+}
+
+
+def bam_filter(infile, outfile, criteria, failedfile=None, verbose=False):
+    if verbose:
+        sys.stderr.write('Input file : %s\n' % infile)
+        sys.stderr.write('Output file : %s\n' % outfile)
+        if failedfile:
+            sys.stderr.write('Failed reads: %s\n' % failedfile)
+        sys.stderr.write('Criteria:\n')
+        for criterion in criteria:
+            sys.stderr.write('    %s\n' % criterion)
+
+        sys.stderr.write('\n')
+
+    bamfile = pysam.Samfile(infile, "rb")
+    outfile = pysam.Samfile(outfile, "wb", template=bamfile)
+
+    if failedfile:
+        failed_out = open(failedfile, 'w')
+    else:
+        failed_out = None
+
+    passed = 0
+    failed = 0
+
+    def _callback(read):
+        return "%s | %s kept,%s failed" % ('%s:%s' % (bamfile.getrname(read.tid), read.pos) if read.tid > -1 else 'unk', passed, failed)
+
+    for read in bam_iter(bamfile, quiet=True):
+        p = True
+
+        for criterion in criteria:
+            if not criterion.filter(bamfile, read):
+                p = False
+                failed += 1
+                if failed_out:
+                    failed_out.write('%s\t%s\n' % (read.qname, criterion))
+                #outfile.write(read_to_unmapped(read))
+                break
+        if p:
+            passed += 1
+            outfile.write(read)
+
+    bamfile.close()
+    outfile.close()
+    if failed_out:
+        failed_out.close()
+    sys.stdout.write("%s kept\n%s failed\n" % (passed, failed))
+
+    for criterion in criteria:
+        criterion.close()
+
+
+def read_to_unmapped(read):
+    '''
+    Example unmapped read
+    2358_2045_1839 | 4 | * | 0 | 0 | * | * | 0 | 0 | GGACGAGAAGGAGTATTTCTCCGAGAACACATTCACGGAGAGTCTAACTC | 0QT\VNO^IIRJKXTIHIKTY\STZZ[XOJKPWLHJQQQ^XPQYIIRQII | PG:Z:bfast | AS:i:- | NH:i:0 | IH:i:1 | HI:i:1 | CS:Z:T10213222020221330022203222011113021130222212230122 | CQ:Z:019=2+><%@.'>52%)'15?90<7;:5+(-49('-7-<>5-@5%<.2%= | CM:i:0 | XA:i:4
+
+    Example mapped read:
+    2216_1487_198 | 16 | chr11:19915291-19925392 | 5531 | 12 | 24M2D26M | * | 0 | 0 | TCTGTCTGGGTGCAGTAGCTATACGTAATCCCAGCACTTTGGGAGGCCAA | 1C8FZ`M""""""WZ""%"#\I;"`R@D]``ZZ``\QKX\Y]````IK^` | PG:Z:bfast | AS:i:1150 | NM:i:2 | NH:i:10 | IH:i:1 | HI:i:1 | MD:Z:24^CT26 | CS:Z:T00103022001002113210023031213331122121223321221122 | CQ:Z:A8=%;AB<;97<3/9:>>3>@?5&18@-%;866:94=14:=?,%?:7&)1 | CM:i:9 | XA:i:4 | XE:Z:-------23322---2-11----2----------------------------
+
+    '''
+    # TODO: rewrite read properties to unmapped state
+    #       if colorspace: convert CS to NT directly
+    #       remove excess tags
+
+    read.is_unmapped = True
+    read.tid = None
+    read.pos = 0
+    read.mapq = 0
+    return read
+
+if __name__ == '__main__':
+    infile = None
+    outfile = None
+    failed = None
+    criteria = []
+
+    crit_args = []
+    last = None
+    verbose = False
+    fail = False
+
+    for arg in sys.argv[1:]:
+        if last == '-failed':
+            failed = arg
+            last = None
+        elif arg == '-h':
+            usage()
+        elif arg == '-failed':
+            last = arg
+        elif arg == '-v':
+            verbose = True
+        elif not infile and os.path.exists(arg):
+            infile = arg
+        elif not outfile:
+            outfile = arg
+        elif arg[0] == '-':
+            if not arg[1:] in _criteria:
+                print "Unknown criterion: %s" % arg
+                fail = True
+            if crit_args:
+                criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+            crit_args = [arg, ]
+        elif crit_args:
+            crit_args.append(arg)
+        else:
+            print "Unknown argument: %s" % arg
+            fail = True
+
+    if not fail and crit_args:
+        criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:]))
+
+    if fail or not infile or not outfile or not criteria:
+        if not infile and not outfile and not criteria:
+            usage()
+
+        if not infile:
+            print "Missing: input bamfile"
+        if not outfile:
+            print "Missing: output bamfile"
+        if not criteria:
+            print "Missing: filtering criteria"
+        usage()
+    else:
+        bam_filter(infile, outfile, criteria, failed, verbose)
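
The filtering classes in this file can also be driven directly from Python rather than through the command-line parser in the `__main__` block. The sketch below is not part of the commit; it assumes `filter.py` is importable as a module and that `ngsutils` and `pysam` are installed (as the imports above require), and the BAM and text file names are placeholders.

```python
# Minimal sketch (assumes filter.py is importable and ngsutils/pysam are installed).
# Builds the same criteria objects that the command line
#     bamutils filter input.bam filtered.bam -mapped -minlen 50 -gte AS:i 1000
# would construct, then calls bam_filter() directly.
from filter import Mapped, ReadMinLength, TagGreaterThanEqual, bam_filter

criteria = [
    Mapped(),                           # -mapped
    ReadMinLength(50),                  # -minlen 50
    TagGreaterThanEqual('AS:i', 1000),  # -gte AS:i 1000
]

# 'input.bam', 'filtered.bam' and 'failed_reads.txt' are placeholder names.
bam_filter('input.bam', 'filtered.bam', criteria,
           failedfile='failed_reads.txt', verbose=True)
```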