Mercurial > repos > iuc > ngsutils_bam_filter
comparison filter.py @ 0:4e4e4093d65d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
| author | iuc |
|---|---|
| date | Wed, 11 Nov 2015 13:04:07 -0500 |
| parents | |
| children | 7a68005de299 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4e4e4093d65d |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 ## category General | |
| 3 ## desc Removes reads from a BAM file based on criteria | |
| 4 """ | |
| 5 Removes reads from a BAM file based on criteria | |
| 6 | |
| 7 Given a BAM file, this script will only allow reads that meet filtering | |
| 8 criteria to be written to output. The output is another BAM file with the | |
| 9 reads not matching the criteria removed. | |
| 10 | |
| 11 Note: this does not adjust tag values reflecting any filtering. (for example: | |
| 12 if a read mapped to two locations (IH:i:2), and one was removed by | |
| 13 filtering, the IH:i tag would still read IH:i:2). | |
| 14 | |
| 15 Currently, the available filters are: | |
| 16 -minlen val Remove reads that are smaller than {val} | |
| 17 -maxlen val Remove reads that are larger than {val} | |
| 18 -mapped Keep only mapped reads | |
| 19 -unmapped Keep only unmapped reads | |
| 20 -properpair Keep only properly paired reads (both mapped, | |
| 21 correct orientation, flag set in BAM) | |
| 22 -noproperpair Keep only not-properly paired reads | |
| 23 | |
| 24 -mask bitmask Remove reads that match the mask (base 10/hex) | |
| 25 -uniq {length} Remove reads that are have the same sequence | |
| 26 Note: BAM file should be sorted | |
| 27 (up to an optional length) | |
| 28 -uniq_start Remove reads that start at the same position | |
| 29 Note: BAM file should be sorted | |
| 30 (Use only for low-coverage samples) | |
| 31 | |
| 32 -mismatch num # mismatches or indels | |
| 33 indel always counts as 1 regardless of length | |
| 34 (requires NM tag in reads) | |
| 35 | |
| 36 -mismatch_dbsnp num dbsnp.txt.bgz | |
| 37 # mismatches or indels - not in dbSNP. | |
| 38 Variations are called using the MD tag. | |
| 39 Variations that are found in the dbSNP list are | |
| 40 not counted as mismatches. The dbSNP list is a | |
| 41 Tabix-indexed dump of dbSNP (from UCSC Genome | |
| 42 Browser). Indels in dbSNP are also counted. | |
| 43 Adds a 'ZS:i' tag with the number of found SNPs | |
| 44 in the read. | |
| 45 (requires NM and MD tags) | |
| 46 | |
| 47 Example command for indexing: | |
| 48 ngsutils tabixindex snp.txt.gz -s 2 -b 3 -e 4 -0 | |
| 49 | |
| 50 -mismatch_ref num ref.fa # mismatches or indel - looks up mismatches | |
| 51 directly in a reference FASTA file | |
| 52 (use if NM tag not present) | |
| 53 | |
| 54 -mismatch_ref_dbsnp num ref.fa dbsnp.txt.bgz | |
| 55 # mismatches or indels - looks up mismatches | |
| 56 directly from a reference FASTA file. (See | |
| 57 -mismatch_dbsnp for dbSNP matching) | |
| 58 (use if NM or MD tag not present) | |
| 59 | |
| 60 -nosecondary Remove reads that have the 0x100 flag set | |
| 61 -noqcfail Remove reads that have the 0x200 flag set | |
| 62 -nopcrdup Remove reads that have the 0x400 flag set | |
| 63 | |
| 64 | |
| 65 -exclude ref:start-end Remove reads in this region (1-based start) | |
| 66 -excludebed file.bed {nostrand} | |
| 67 Remove reads that are in any of the regions | |
| 68 from the given BED file. If 'nostrand' is given, | |
| 69 strand information from the BED file is ignored. | |
| 70 | |
| 71 -include ref:start-end Remove reads NOT in the region (can only be one) | |
| 72 -includebed file.bed {nostrand} | |
| 73 Remove reads that are NOT any of the regions | |
| 74 from the given BED file. If 'nostrand' is given, | |
| 75 strand information from the BED file is ignored. | |
| 76 | |
| 77 Note: If this is a large dataset, use | |
| 78 "bamutils extract" instead. | |
| 79 | |
| 80 -includeref refname Exclude reads NOT mapped to a reference | |
| 81 -excluderef refname Exclude reads mapped to a particular reference | |
| 82 (e.g. chrM, or _dup chromosomes) | |
| 83 | |
| 84 -whitelist fname Remove reads that aren't on this list (by name) | |
| 85 -blacklist fname Remove reads that are on this list (by name) | |
| 86 These lists can be whitespace-delimited with | |
| 87 the read name as the first column. | |
| 88 -maximum_mismatch_ratio val | |
| 89 Filter by maximum mismatch ratio (fraction of length) | |
| 90 | |
| 91 -eq tag_name value | |
| 92 -lt tag_name value | |
| 93 -lte tag_name value | |
| 94 -gt tag_name value | |
| 95 -gte tag_name value | |
| 96 | |
| 97 As a special case, "MAPQ" can be used as the tag_name and the SAM MAPQ | |
| 98 value will be used. | |
| 99 | |
| 100 Common tags to filter by: | |
| 101 AS Alignment score | |
| 102 IH Number of alignments | |
| 103 NM Edit distance (each indel counts as many as its length) | |
| 104 | |
| 105 MAPQ Mapping quality (defined as part of SAM spec) | |
| 106 | |
| 107 The tag type (:i, :f, :Z) is optional. | |
| 108 | |
| 109 """ | |
| 110 | |
| 111 import os | |
| 112 import sys | |
| 113 import pysam | |
| 114 from ngsutils.bam import bam_iter | |
| 115 from ngsutils.support.dbsnp import DBSNP | |
| 116 from ngsutils.bam import read_calc_mismatches, read_calc_mismatches_ref, read_calc_mismatches_gen, read_calc_variations | |
| 117 from ngsutils.bed import BedFile | |
| 118 | |
| 119 | |
| 120 def usage(): | |
| 121 print __doc__ | |
| 122 print """ | |
| 123 Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria... | |
| 124 | |
| 125 Options: | |
| 126 -failed fname A text file containing the read names of all reads | |
| 127 that were removed with filtering | |
| 128 | |
| 129 Example: | |
| 130 bamutils filter filename.bam output.bam -mapped -gte AS:i 1000 | |
| 131 | |
| 132 This will remove all unmapped reads, as well as any reads that have an AS:i | |
| 133 value less than 1000. | |
| 134 """ | |
| 135 sys.exit(1) | |
| 136 | |
| 137 | |
| 138 class Unique(object): | |
| 139 def __init__(self, length=None): | |
| 140 if length: | |
| 141 self.length = int(length) | |
| 142 else: | |
| 143 self.length = None | |
| 144 | |
| 145 self.last_pos = None | |
| 146 self.pos_reads = set() | |
| 147 | |
| 148 def __repr__(self): | |
| 149 return "uniq" | |
| 150 | |
| 151 def filter(self, bam, read): | |
| 152 if self.last_pos != (read.tid, read.pos): | |
| 153 self.last_pos = (read.tid, read.pos) | |
| 154 self.pos_reads = set() | |
| 155 | |
| 156 if read.is_reverse: | |
| 157 seq = read.seq[::-1] # ignore revcomp for now, it isn't needed, just need to compare in the proper order | |
| 158 else: | |
| 159 seq = read.seq | |
| 160 | |
| 161 if self.length: | |
| 162 seq = seq[:self.length] | |
| 163 | |
| 164 if seq in self.pos_reads: | |
| 165 return False | |
| 166 | |
| 167 self.pos_reads.add(seq) | |
| 168 return True | |
| 169 | |
| 170 def close(self): | |
| 171 pass | |
| 172 | |
| 173 | |
| 174 class UniqueStart(object): | |
| 175 def __init__(self): | |
| 176 self.last_tid = None | |
| 177 self.last_fwd_pos = -1 | |
| 178 self.rev_pos_list = None | |
| 179 | |
| 180 def __repr__(self): | |
| 181 return "uniq_start" | |
| 182 | |
| 183 def filter(self, bam, read): | |
| 184 if self.last_tid is None or self.last_tid != read.tid: | |
| 185 self.last_tid = read.tid | |
| 186 self.rev_pos = set() | |
| 187 self.last_fwd_pos = -1 | |
| 188 self.last_rev_pos = -1 | |
| 189 | |
| 190 if read.is_reverse: | |
| 191 # check reverse reads from their start (3' aend) | |
| 192 # these aren't necessarily in the correct | |
| 193 # order in the file, so we have to track them in a set | |
| 194 | |
| 195 start_pos = read.aend | |
| 196 | |
| 197 # clean up hash if necesary | |
| 198 # Allow up to 100k over, to balance cleaning up the rev_pos hash | |
| 199 # and memory | |
| 200 if read.pos > (self.last_rev_pos + 100000): | |
| 201 self.last_rev_pos = start_pos | |
| 202 del_list = [] | |
| 203 for k in self.rev_pos: | |
| 204 if k < read.pos: | |
| 205 del_list.append(k) | |
| 206 | |
| 207 for k in del_list: | |
| 208 self.rev_pos.remove(k) | |
| 209 | |
| 210 if not start_pos in self.rev_pos: | |
| 211 self.rev_pos.add(start_pos) | |
| 212 return True | |
| 213 return False | |
| 214 else: | |
| 215 if read.pos != self.last_fwd_pos: | |
| 216 self.last_fwd_pos = read.pos | |
| 217 return True | |
| 218 return False | |
| 219 | |
| 220 # if self.last_pos != (read.tid, read.pos): | |
| 221 # self.last_pos = (read.tid, read.pos) | |
| 222 # return True | |
| 223 # return False | |
| 224 | |
| 225 def close(self): | |
| 226 pass | |
| 227 | |
| 228 | |
| 229 class Blacklist(object): | |
| 230 def __init__(self, fname): | |
| 231 self.fname = fname | |
| 232 self.notallowed = set() | |
| 233 with open(fname) as f: | |
| 234 for line in f: | |
| 235 self.notallowed.add(line.strip().split()[0]) | |
| 236 | |
| 237 def filter(self, bam, read): | |
| 238 return read.qname not in self.notallowed | |
| 239 | |
| 240 def __repr__(self): | |
| 241 return 'Blacklist: %s' % (self.fname) | |
| 242 | |
| 243 def close(self): | |
| 244 pass | |
| 245 | |
| 246 | |
| 247 class Whitelist(object): | |
| 248 def __init__(self, fname): | |
| 249 self.fname = fname | |
| 250 self.allowed = set() | |
| 251 with open(fname) as f: | |
| 252 for line in f: | |
| 253 self.allowed.add(line.strip().split()[0]) | |
| 254 | |
| 255 def filter(self, bam, read): | |
| 256 return read.qname in self.allowed | |
| 257 | |
| 258 def __repr__(self): | |
| 259 return 'Whitelist: %s' % (self.fname) | |
| 260 | |
| 261 def close(self): | |
| 262 pass | |
| 263 | |
| 264 | |
| 265 class IncludeRegion(object): | |
| 266 _excludes = [] | |
| 267 _last = None | |
| 268 | |
| 269 def __init__(self, region): | |
| 270 IncludeRegion._excludes.append(ExcludeRegion(region)) | |
| 271 # self.excl = ExcludeRegion(region) | |
| 272 | |
| 273 def filter(self, bam, read): | |
| 274 if read == IncludeRegion._last: | |
| 275 return True | |
| 276 | |
| 277 IncludeRegion._last = read | |
| 278 | |
| 279 for excl in IncludeRegion._excludes: | |
| 280 if not excl.filter(bam, read): | |
| 281 return True | |
| 282 return False | |
| 283 | |
| 284 # return not self.excl.filter(bam,read) | |
| 285 def __repr__(self): | |
| 286 return 'Including: %s' % (self.excl.region) | |
| 287 | |
| 288 def close(self): | |
| 289 pass | |
| 290 | |
| 291 | |
| 292 class IncludeBED(object): | |
| 293 def __init__(self, fname, nostrand=None): | |
| 294 self.excl = ExcludeBED(fname, nostrand) | |
| 295 | |
| 296 def filter(self, bam, read): | |
| 297 return not self.excl.filter(bam, read) | |
| 298 | |
| 299 def __repr__(self): | |
| 300 return 'Including from BED: %s%s' % (self.excl.fname, ' nostrand' if self.excl.nostrand else '') | |
| 301 | |
| 302 def close(self): | |
| 303 pass | |
| 304 | |
| 305 | |
| 306 class ExcludeRegion(object): | |
| 307 def __init__(self, region): | |
| 308 self.region = region | |
| 309 spl = region.split(':') | |
| 310 self.chrom = spl[0] | |
| 311 se = [int(x) for x in spl[1].split('-')] | |
| 312 self.start = se[0] - 1 | |
| 313 self.end = se[1] | |
| 314 | |
| 315 def filter(self, bam, read): | |
| 316 if not read.is_unmapped: | |
| 317 if bam.getrname(read.tid) == self.chrom: | |
| 318 if self.start <= read.pos <= self.end: | |
| 319 return False | |
| 320 if self.start <= read.aend <= self.end: | |
| 321 return False | |
| 322 return True | |
| 323 | |
| 324 def __repr__(self): | |
| 325 return 'Excluding: %s' % (self.region) | |
| 326 | |
| 327 def close(self): | |
| 328 pass | |
| 329 | |
| 330 | |
| 331 class ExcludeRef(object): | |
| 332 def __init__(self, ref): | |
| 333 self.ref = ref | |
| 334 | |
| 335 def filter(self, bam, read): | |
| 336 if not read.is_unmapped: | |
| 337 if bam.getrname(read.tid) == self.ref: | |
| 338 return False | |
| 339 return True | |
| 340 | |
| 341 def __repr__(self): | |
| 342 return 'Excluding: %s' % (self.ref) | |
| 343 | |
| 344 def close(self): | |
| 345 pass | |
| 346 | |
| 347 class IncludeRef(object): | |
| 348 def __init__(self, ref): | |
| 349 self.ref = ref | |
| 350 | |
| 351 def filter(self, bam, read): | |
| 352 if not read.is_unmapped: | |
| 353 if bam.getrname(read.tid) == self.ref: | |
| 354 return True | |
| 355 return False | |
| 356 | |
| 357 def __repr__(self): | |
| 358 return 'Including: %s' % (self.ref) | |
| 359 | |
| 360 def close(self): | |
| 361 pass | |
| 362 | |
| 363 | |
| 364 class ExcludeBED(object): | |
| 365 def __init__(self, fname, nostrand=None): | |
| 366 self.regions = {} # store BED regions as keyed bins (chrom, bin) | |
| 367 self.fname = fname | |
| 368 if nostrand == 'nostrand': | |
| 369 self.nostrand = True | |
| 370 else: | |
| 371 self.nostrand = False | |
| 372 | |
| 373 self.bed = BedFile(fname) | |
| 374 # with open(fname) as f: | |
| 375 # for line in f: | |
| 376 # if not line: | |
| 377 # continue | |
| 378 # if line[0] == '#': | |
| 379 # continue | |
| 380 # cols = line.strip().split('\t') | |
| 381 | |
| 382 # chrom = cols[0] | |
| 383 # start = int(cols[1]) | |
| 384 # end = int(cols[2]) | |
| 385 # if self.nostrand: | |
| 386 # strand = '?' | |
| 387 # else: | |
| 388 # strand = cols[5] | |
| 389 | |
| 390 # startbin = start / 100000 | |
| 391 # endbin = end / 100000 | |
| 392 | |
| 393 # for bin in xrange(startbin, endbin + 1): | |
| 394 # if not (chrom, bin) in self.regions: | |
| 395 # self.regions[(chrom, bin)] = [] | |
| 396 # self.regions[(chrom, bin)].append((start, end, strand)) | |
| 397 | |
| 398 def filter(self, bam, read): | |
| 399 if not read.is_unmapped: | |
| 400 if self.nostrand: | |
| 401 strand = None | |
| 402 elif read.is_reverse: | |
| 403 strand = '-' | |
| 404 else: | |
| 405 strand = '+' | |
| 406 | |
| 407 for region in self.bed.fetch(bam.getrname(read.tid), read.pos, read.aend, strand): | |
| 408 # region found, exclude read | |
| 409 return False | |
| 410 return True | |
| 411 | |
| 412 # bin = read.pos / 100000 | |
| 413 # ref = bam.getrname(read.tid) | |
| 414 | |
| 415 # if not (ref, bin) in self.regions: | |
| 416 # return True | |
| 417 | |
| 418 # for start, end, strand in self.regions[(ref, bin)]: | |
| 419 # if not self.nostrand: | |
| 420 # if strand == '+' and read.is_reverse: | |
| 421 # continue | |
| 422 # if strand == '-' and not read.is_reverse: | |
| 423 # continue | |
| 424 # if start <= read.pos <= end: | |
| 425 # return False | |
| 426 # if start <= read.aend <= end: | |
| 427 # return False | |
| 428 # return True | |
| 429 | |
| 430 def __repr__(self): | |
| 431 return 'Excluding from BED: %s%s' % (self.fname, ' nostrand' if self.nostrand else '') | |
| 432 | |
| 433 def close(self): | |
| 434 pass | |
| 435 | |
| 436 | |
| 437 class Mismatch(object): | |
| 438 def __init__(self, num): | |
| 439 self.num = int(num) | |
| 440 | |
| 441 def filter(self, bam, read): | |
| 442 if read.is_unmapped: | |
| 443 return False | |
| 444 | |
| 445 if read_calc_mismatches(read) > self.num: | |
| 446 return False | |
| 447 | |
| 448 return True | |
| 449 | |
| 450 def __repr__(self): | |
| 451 return '>%s mismatch%s' % (self.num, '' if self.num == 1 else 'es') | |
| 452 | |
| 453 def close(self): | |
| 454 pass | |
| 455 | |
| 456 | |
| 457 class MismatchRef(object): | |
| 458 def __init__(self, num, refname): | |
| 459 self.num = int(num) | |
| 460 self.refname = refname | |
| 461 | |
| 462 if not os.path.exists('%s.fai' % refname): | |
| 463 pysam.faidx(refname) | |
| 464 | |
| 465 self.ref = pysam.Fastafile(refname) | |
| 466 | |
| 467 def filter(self, bam, read): | |
| 468 if read.is_unmapped: | |
| 469 return False | |
| 470 | |
| 471 chrom = bam.getrname(read.tid) | |
| 472 if read_calc_mismatches_ref(self.ref, read, chrom) > self.num: | |
| 473 return False | |
| 474 | |
| 475 return True | |
| 476 | |
| 477 def __repr__(self): | |
| 478 return '>%s mismatch%s in %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.refname)) | |
| 479 | |
| 480 def close(self): | |
| 481 self.ref.close() | |
| 482 | |
| 483 | |
| 484 class MismatchDbSNP(object): | |
| 485 def __init__(self, num, fname, verbose=None): | |
| 486 sys.stderr.write('Note: MismatchDbSNP is considered *experimental*\n') | |
| 487 | |
| 488 self.num = int(num) | |
| 489 self.fname = fname | |
| 490 self.dbsnp = DBSNP(fname) | |
| 491 if verbose == 'verbose': | |
| 492 self.verbose = True | |
| 493 else: | |
| 494 self.verbose = False | |
| 495 | |
| 496 def filter(self, bam, read): | |
| 497 if read.is_unmapped: | |
| 498 return False | |
| 499 | |
| 500 if read_calc_mismatches(read) <= self.num: | |
| 501 return True | |
| 502 | |
| 503 chrom = bam.getrname(read.tid) | |
| 504 | |
| 505 mm = 0 | |
| 506 snps = 0 | |
| 507 | |
| 508 for op, pos, seq in read_calc_variations(read): | |
| 509 if not self.dbsnp.is_valid_variation(chrom, op, pos, seq, self.verbose): | |
| 510 mm += 1 | |
| 511 else: | |
| 512 snps += 1 | |
| 513 | |
| 514 if mm > self.num: | |
| 515 return False | |
| 516 | |
| 517 if snps: | |
| 518 read.tags = read.tags + [('ZS', snps)] | |
| 519 | |
| 520 return True | |
| 521 | |
| 522 def __repr__(self): | |
| 523 return '>%s mismatch%s using %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.fname)) | |
| 524 | |
| 525 def close(self): | |
| 526 self.dbsnp.close() | |
| 527 | |
| 528 | |
| 529 class MismatchRefDbSNP(object): | |
| 530 def __init__(self, num, refname, dbsnpname): | |
| 531 sys.stderr.write('Note: MismatchRefDbSNP is considered *experimental*\n') | |
| 532 self.num = int(num) | |
| 533 self.refname = refname | |
| 534 self.dbsnp = DBSNP(dbsnpname) | |
| 535 | |
| 536 if not os.path.exists('%s.fai' % refname): | |
| 537 pysam.faidx(refname) | |
| 538 | |
| 539 self.ref = pysam.Fastafile(refname) | |
| 540 | |
| 541 def filter(self, bam, read): | |
| 542 if read.is_unmapped: | |
| 543 return False | |
| 544 | |
| 545 chrom = bam.getrname(read.tid) | |
| 546 | |
| 547 mm = 0 | |
| 548 snps = 0 | |
| 549 | |
| 550 for op, pos, seq in read_calc_mismatches_gen(self.ref, read, chrom): | |
| 551 if not self.dbsnp.is_valid_variation(chrom, op, pos, seq): | |
| 552 mm += 1 | |
| 553 else: | |
| 554 snps += 1 | |
| 555 | |
| 556 if mm > self.num: | |
| 557 return False | |
| 558 | |
| 559 if snps: | |
| 560 read.tags = read.tags + [('ZS', snps)] | |
| 561 | |
| 562 return True | |
| 563 | |
| 564 def __repr__(self): | |
| 565 return '>%s mismatch%s using %s/%s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.dbsnpname), os.path.basename(self.refname)) | |
| 566 | |
| 567 def close(self): | |
| 568 self.ref.close() | |
| 569 self.dbsnp.close() | |
| 570 | |
| 571 | |
| 572 class Mapped(object): | |
| 573 def __init__(self): | |
| 574 pass | |
| 575 | |
| 576 def filter(self, bam, read): | |
| 577 if read.is_paired and (read.is_unmapped or read.mate_is_unmapped): | |
| 578 return False | |
| 579 elif read.is_unmapped: | |
| 580 return False | |
| 581 return True | |
| 582 | |
| 583 def __repr__(self): | |
| 584 return 'is mapped' | |
| 585 | |
| 586 def close(self): | |
| 587 pass | |
| 588 | |
| 589 | |
| 590 class Unmapped(object): | |
| 591 def __init__(self): | |
| 592 pass | |
| 593 | |
| 594 def filter(self, bam, read): | |
| 595 if read.is_paired and (read.is_unmapped or read.mate_is_unmapped): | |
| 596 return True | |
| 597 elif read.is_unmapped: | |
| 598 return True | |
| 599 return False | |
| 600 | |
| 601 def __repr__(self): | |
| 602 return 'is unmapped' | |
| 603 | |
| 604 def close(self): | |
| 605 pass | |
| 606 | |
| 607 | |
| 608 class ProperPair(object): | |
| 609 def __init__(self): | |
| 610 pass | |
| 611 | |
| 612 def filter(self, bam, read): | |
| 613 if not read.is_paired: | |
| 614 return False | |
| 615 | |
| 616 if read.is_unmapped or read.mate_is_unmapped: | |
| 617 return False | |
| 618 | |
| 619 if read.is_reverse == read.mate_is_reverse: | |
| 620 return False | |
| 621 | |
| 622 return read.is_proper_pair | |
| 623 | |
| 624 def __repr__(self): | |
| 625 return 'proper pair' | |
| 626 | |
| 627 def close(self): | |
| 628 pass | |
| 629 | |
| 630 | |
| 631 class NoProperPair(object): | |
| 632 def __init__(self): | |
| 633 self.proper = ProperPair() | |
| 634 pass | |
| 635 | |
| 636 def filter(self, bam, read): | |
| 637 return not self.proper.filter(bam, read) | |
| 638 | |
| 639 def __repr__(self): | |
| 640 return 'not proper pairs' | |
| 641 | |
| 642 def close(self): | |
| 643 pass | |
| 644 | |
| 645 | |
| 646 class MaskFlag(object): | |
| 647 def __init__(self, value): | |
| 648 if type(value) == type(1): | |
| 649 self.flag = value | |
| 650 else: | |
| 651 if value[0:2] == '0x': | |
| 652 self.flag = int(value, 16) | |
| 653 else: | |
| 654 self.flag = int(value) | |
| 655 | |
| 656 def __repr__(self): | |
| 657 return "Doesn't match flag: %s" % self.flag | |
| 658 | |
| 659 def filter(self, bam, read): | |
| 660 return (read.flag & self.flag) == 0 | |
| 661 | |
| 662 def close(self): | |
| 663 pass | |
| 664 | |
| 665 | |
| 666 class SecondaryFlag(object): | |
| 667 def __repr__(self): | |
| 668 return "no 0x100 (secondary) flag" | |
| 669 | |
| 670 def filter(self, bam, read): | |
| 671 return not read.is_secondary | |
| 672 | |
| 673 def close(self): | |
| 674 pass | |
| 675 | |
| 676 | |
| 677 class ReadMinLength(object): | |
| 678 def __init__(self, minval): | |
| 679 self.minval = int(minval) | |
| 680 | |
| 681 def __repr__(self): | |
| 682 return "read length min: %s" % self.minval | |
| 683 | |
| 684 def filter(self, bam, read): | |
| 685 return len(read.seq) >= self.minval | |
| 686 | |
| 687 def close(self): | |
| 688 pass | |
| 689 | |
| 690 | |
| 691 class ReadMaxLength(object): | |
| 692 def __init__(self, val): | |
| 693 self.val = int(val) | |
| 694 | |
| 695 def __repr__(self): | |
| 696 return "read length max: %s" % self.val | |
| 697 | |
| 698 def filter(self, bam, read): | |
| 699 return len(read.seq) <= self.val | |
| 700 | |
| 701 def close(self): | |
| 702 pass | |
| 703 | |
| 704 | |
| 705 class MaximumMismatchRatio(object): | |
| 706 def __init__(self, ratio): | |
| 707 self.ratio = float(ratio) | |
| 708 | |
| 709 def __repr__(self): | |
| 710 return "maximum mismatch ratio: %s" % self.val | |
| 711 | |
| 712 def filter(self, bam, read): | |
| 713 return read_calc_mismatches(read) <= self.ratio*len(read.seq) | |
| 714 | |
| 715 def close(self): | |
| 716 pass | |
| 717 | |
| 718 | |
| 719 class QCFailFlag(object): | |
| 720 def __repr__(self): | |
| 721 return "no 0x200 (qcfail) flag" | |
| 722 | |
| 723 def filter(self, bam, read): | |
| 724 return not read.is_qcfail | |
| 725 | |
| 726 def close(self): | |
| 727 pass | |
| 728 | |
| 729 | |
| 730 class PCRDupFlag(object): | |
| 731 def __repr__(self): | |
| 732 return "no 0x400 (pcrdup) flag" | |
| 733 | |
| 734 def filter(self, bam, read): | |
| 735 return not read.is_duplicate | |
| 736 | |
| 737 def close(self): | |
| 738 pass | |
| 739 | |
| 740 | |
| 741 class _TagCompare(object): | |
| 742 def __init__(self, tag, value): | |
| 743 self.args = '%s %s' % (tag, value) | |
| 744 | |
| 745 if ':' in tag: | |
| 746 self.tag = tag.split(':')[0] | |
| 747 tagtype = tag.split(':')[1] | |
| 748 if tagtype == 'i': | |
| 749 self.value = int(value) | |
| 750 elif tagtype == 'f': | |
| 751 self.value = float(value) | |
| 752 elif tagtype == 'H': | |
| 753 self.value = int(value) # not sure about this | |
| 754 else: | |
| 755 self.value = value | |
| 756 else: | |
| 757 self.tag = tag | |
| 758 | |
| 759 # guess at type... | |
| 760 try: | |
| 761 self.value = int(value) | |
| 762 except: | |
| 763 try: | |
| 764 self.value = float(value) | |
| 765 except: | |
| 766 self.value = value | |
| 767 | |
| 768 def get_value(self, read): | |
| 769 if self.tag == 'MAPQ': | |
| 770 return read.mapq | |
| 771 | |
| 772 for name, value in read.tags: | |
| 773 if name == self.tag: | |
| 774 return value | |
| 775 | |
| 776 return None | |
| 777 | |
| 778 def __repr__(self): | |
| 779 return "%s %s %s" % (self.tag, self.__class__.op, self.value) | |
| 780 | |
| 781 def close(self): | |
| 782 pass | |
| 783 | |
| 784 | |
| 785 class TagLessThan(_TagCompare): | |
| 786 op = '<' | |
| 787 | |
| 788 def filter(self, bam, read): | |
| 789 if self.get_value(read) < self.value: | |
| 790 return True | |
| 791 return False | |
| 792 | |
| 793 | |
| 794 class TagLessThanEqual(_TagCompare): | |
| 795 op = '<=' | |
| 796 | |
| 797 def filter(self, bam, read): | |
| 798 if self.get_value(read) <= self.value: | |
| 799 return True | |
| 800 return False | |
| 801 | |
| 802 | |
| 803 class TagGreaterThan(_TagCompare): | |
| 804 op = '>' | |
| 805 | |
| 806 def filter(self, bam, read): | |
| 807 if self.get_value(read) > self.value: | |
| 808 return True | |
| 809 return False | |
| 810 | |
| 811 | |
| 812 class TagGreaterThanEqual(_TagCompare): | |
| 813 op = '>=' | |
| 814 | |
| 815 def filter(self, bam, read): | |
| 816 if self.get_value(read) >= self.value: | |
| 817 return True | |
| 818 return False | |
| 819 | |
| 820 | |
| 821 class TagEqual(_TagCompare): | |
| 822 op = '=' | |
| 823 | |
| 824 def filter(self, bam, read): | |
| 825 if self.get_value(read) == self.value: | |
| 826 return True | |
| 827 return False | |
| 828 | |
| 829 _criteria = { | |
| 830 'mapped': Mapped, | |
| 831 'unmapped': Unmapped, | |
| 832 'properpair': ProperPair, | |
| 833 'noproperpair': NoProperPair, | |
| 834 'noqcfail': QCFailFlag, | |
| 835 'nosecondary': SecondaryFlag, | |
| 836 'nopcrdup': PCRDupFlag, | |
| 837 'mask': MaskFlag, | |
| 838 'lt': TagLessThan, | |
| 839 'gt': TagGreaterThan, | |
| 840 'lte': TagLessThanEqual, | |
| 841 'gte': TagGreaterThanEqual, | |
| 842 'eq': TagEqual, | |
| 843 'mismatch': Mismatch, | |
| 844 'mismatch_ref': MismatchRef, | |
| 845 'mismatch_dbsnp': MismatchDbSNP, | |
| 846 'mismatch_ref_dbsnp': MismatchRefDbSNP, | |
| 847 'exclude': ExcludeRegion, | |
| 848 'excludebed': ExcludeBED, | |
| 849 'excluderef': ExcludeRef, | |
| 850 'includeref': IncludeRef, | |
| 851 'include': IncludeRegion, | |
| 852 'includebed': IncludeBED, | |
| 853 'whitelist': Whitelist, | |
| 854 'blacklist': Blacklist, | |
| 855 'length': ReadMinLength, # deprecated | |
| 856 'minlen': ReadMinLength, | |
| 857 'maxlen': ReadMaxLength, | |
| 858 'uniq': Unique, | |
| 859 'maximum_mismatch_ratio': MaximumMismatchRatio | |
| 860 } | |
| 861 | |
| 862 | |
| 863 def bam_filter(infile, outfile, criteria, failedfile=None, verbose=False): | |
| 864 if verbose: | |
| 865 sys.stderr.write('Input file : %s\n' % infile) | |
| 866 sys.stderr.write('Output file : %s\n' % outfile) | |
| 867 if failedfile: | |
| 868 sys.stderr.write('Failed reads: %s\n' % failedfile) | |
| 869 sys.stderr.write('Criteria:\n') | |
| 870 for criterion in criteria: | |
| 871 sys.stderr.write(' %s\n' % criterion) | |
| 872 | |
| 873 sys.stderr.write('\n') | |
| 874 | |
| 875 bamfile = pysam.Samfile(infile, "rb") | |
| 876 outfile = pysam.Samfile(outfile, "wb", template=bamfile) | |
| 877 | |
| 878 if failedfile: | |
| 879 failed_out = open(failedfile, 'w') | |
| 880 else: | |
| 881 failed_out = None | |
| 882 | |
| 883 passed = 0 | |
| 884 failed = 0 | |
| 885 | |
| 886 def _callback(read): | |
| 887 return "%s | %s kept,%s failed" % ('%s:%s' % (bamfile.getrname(read.tid), read.pos) if read.tid > -1 else 'unk', passed, failed) | |
| 888 | |
| 889 for read in bam_iter(bamfile, quiet=True): | |
| 890 p = True | |
| 891 | |
| 892 for criterion in criteria: | |
| 893 if not criterion.filter(bamfile, read): | |
| 894 p = False | |
| 895 failed += 1 | |
| 896 if failed_out: | |
| 897 failed_out.write('%s\t%s\n' % (read.qname, criterion)) | |
| 898 #outfile.write(read_to_unmapped(read)) | |
| 899 break | |
| 900 if p: | |
| 901 passed += 1 | |
| 902 outfile.write(read) | |
| 903 | |
| 904 bamfile.close() | |
| 905 outfile.close() | |
| 906 if failed_out: | |
| 907 failed_out.close() | |
| 908 sys.stdout.write("%s kept\n%s failed\n" % (passed, failed)) | |
| 909 | |
| 910 for criterion in criteria: | |
| 911 criterion.close() | |
| 912 | |
| 913 | |
| 914 def read_to_unmapped(read): | |
| 915 ''' | |
| 916 Example unmapped read | |
| 917 2358_2045_1839 | 4 | * | 0 | 0 | * | * | 0 | 0 | GGACGAGAAGGAGTATTTCTCCGAGAACACATTCACGGAGAGTCTAACTC | 0QT\VNO^IIRJKXTIHIKTY\STZZ[XOJKPWLHJQQQ^XPQYIIRQII | PG:Z:bfast | AS:i:- | NH:i:0 | IH:i:1 | HI:i:1 | CS:Z:T10213222020221330022203222011113021130222212230122 | CQ:Z:019=2+><%@.'>52%)'15?90<7;:5+(-49('-7-<>5-@5%<.2%= | CM:i:0 | XA:i:4 | |
| 918 | |
| 919 Example mapped read: | |
| 920 2216_1487_198 | 16 | chr11:19915291-19925392 | 5531 | 12 | 24M2D26M | * | 0 | 0 | TCTGTCTGGGTGCAGTAGCTATACGTAATCCCAGCACTTTGGGAGGCCAA | 1C8FZ`M""""""WZ""%"#\I;"`R@D]``ZZ``\QKX\Y]````IK^` | PG:Z:bfast | AS:i:1150 | NM:i:2 | NH:i:10 | IH:i:1 | HI:i:1 | MD:Z:24^CT26 | CS:Z:T00103022001002113210023031213331122121223321221122 | CQ:Z:A8=%;AB<;97<3/9:>>3>@?5&18@-%;866:94=14:=?,%?:7&)1 | CM:i:9 | XA:i:4 | XE:Z:-------23322---2-11----2---------------------------- | |
| 921 | |
| 922 ''' | |
| 923 # TODO: rewrite read properties to unmapped state | |
| 924 # if colorspace: convert CS to NT directly | |
| 925 # remove excess tags | |
| 926 | |
| 927 read.is_unmapped = True | |
| 928 read.tid = None | |
| 929 read.pos = 0 | |
| 930 read.mapq = 0 | |
| 931 return read | |
| 932 | |
| 933 if __name__ == '__main__': | |
| 934 infile = None | |
| 935 outfile = None | |
| 936 failed = None | |
| 937 criteria = [] | |
| 938 | |
| 939 crit_args = [] | |
| 940 last = None | |
| 941 verbose = False | |
| 942 fail = False | |
| 943 | |
| 944 for arg in sys.argv[1:]: | |
| 945 if last == '-failed': | |
| 946 failed = arg | |
| 947 last = None | |
| 948 elif arg == '-h': | |
| 949 usage() | |
| 950 elif arg == '-failed': | |
| 951 last = arg | |
| 952 elif arg == '-v': | |
| 953 verbose = True | |
| 954 elif not infile and os.path.exists(arg): | |
| 955 infile = arg | |
| 956 elif not outfile: | |
| 957 outfile = arg | |
| 958 elif arg[0] == '-': | |
| 959 if not arg[1:] in _criteria: | |
| 960 print "Unknown criterion: %s" % arg | |
| 961 fail = True | |
| 962 if crit_args: | |
| 963 criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:])) | |
| 964 crit_args = [arg, ] | |
| 965 elif crit_args: | |
| 966 crit_args.append(arg) | |
| 967 else: | |
| 968 print "Unknown argument: %s" % arg | |
| 969 fail = True | |
| 970 | |
| 971 if not fail and crit_args: | |
| 972 criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:])) | |
| 973 | |
| 974 if fail or not infile or not outfile or not criteria: | |
| 975 if not infile and not outfile and not criteria: | |
| 976 usage() | |
| 977 | |
| 978 if not infile: | |
| 979 print "Missing: input bamfile" | |
| 980 if not outfile: | |
| 981 print "Missing: output bamfile" | |
| 982 if not criteria: | |
| 983 print "Missing: filtering criteria" | |
| 984 usage() | |
| 985 else: | |
| 986 bam_filter(infile, outfile, criteria, failed, verbose) |
