Mercurial > repos > iuc > ngsutils_bam_filter
comparison filter.py @ 0:4e4e4093d65d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author | iuc |
---|---|
date | Wed, 11 Nov 2015 13:04:07 -0500 |
parents | |
children | 7a68005de299 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4e4e4093d65d |
---|---|
1 #!/usr/bin/env python | |
2 ## category General | |
3 ## desc Removes reads from a BAM file based on criteria | |
4 """ | |
5 Removes reads from a BAM file based on criteria | |
6 | |
7 Given a BAM file, this script will only allow reads that meet filtering | |
8 criteria to be written to output. The output is another BAM file with the | |
9 reads not matching the criteria removed. | |
10 | |
11 Note: this does not adjust tag values reflecting any filtering. (for example: | |
12 if a read mapped to two locations (IH:i:2), and one was removed by | |
13 filtering, the IH:i tag would still read IH:i:2). | |
14 | |
15 Currently, the available filters are: | |
16 -minlen val Remove reads that are smaller than {val} | |
17 -maxlen val Remove reads that are larger than {val} | |
18 -mapped Keep only mapped reads | |
19 -unmapped Keep only unmapped reads | |
20 -properpair Keep only properly paired reads (both mapped, | |
21 correct orientation, flag set in BAM) | |
22 -noproperpair Keep only not-properly paired reads | |
23 | |
24 -mask bitmask Remove reads that match the mask (base 10/hex) | |
25 -uniq {length} Remove reads that are have the same sequence | |
26 Note: BAM file should be sorted | |
27 (up to an optional length) | |
28 -uniq_start Remove reads that start at the same position | |
29 Note: BAM file should be sorted | |
30 (Use only for low-coverage samples) | |
31 | |
32 -mismatch num # mismatches or indels | |
33 indel always counts as 1 regardless of length | |
34 (requires NM tag in reads) | |
35 | |
36 -mismatch_dbsnp num dbsnp.txt.bgz | |
37 # mismatches or indels - not in dbSNP. | |
38 Variations are called using the MD tag. | |
39 Variations that are found in the dbSNP list are | |
40 not counted as mismatches. The dbSNP list is a | |
41 Tabix-indexed dump of dbSNP (from UCSC Genome | |
42 Browser). Indels in dbSNP are also counted. | |
43 Adds a 'ZS:i' tag with the number of found SNPs | |
44 in the read. | |
45 (requires NM and MD tags) | |
46 | |
47 Example command for indexing: | |
48 ngsutils tabixindex snp.txt.gz -s 2 -b 3 -e 4 -0 | |
49 | |
50 -mismatch_ref num ref.fa # mismatches or indel - looks up mismatches | |
51 directly in a reference FASTA file | |
52 (use if NM tag not present) | |
53 | |
54 -mismatch_ref_dbsnp num ref.fa dbsnp.txt.bgz | |
55 # mismatches or indels - looks up mismatches | |
56 directly from a reference FASTA file. (See | |
57 -mismatch_dbsnp for dbSNP matching) | |
58 (use if NM or MD tag not present) | |
59 | |
60 -nosecondary Remove reads that have the 0x100 flag set | |
61 -noqcfail Remove reads that have the 0x200 flag set | |
62 -nopcrdup Remove reads that have the 0x400 flag set | |
63 | |
64 | |
65 -exclude ref:start-end Remove reads in this region (1-based start) | |
66 -excludebed file.bed {nostrand} | |
67 Remove reads that are in any of the regions | |
68 from the given BED file. If 'nostrand' is given, | |
69 strand information from the BED file is ignored. | |
70 | |
71 -include ref:start-end Remove reads NOT in the region (can only be one) | |
72 -includebed file.bed {nostrand} | |
73 Remove reads that are NOT any of the regions | |
74 from the given BED file. If 'nostrand' is given, | |
75 strand information from the BED file is ignored. | |
76 | |
77 Note: If this is a large dataset, use | |
78 "bamutils extract" instead. | |
79 | |
80 -includeref refname Exclude reads NOT mapped to a reference | |
81 -excluderef refname Exclude reads mapped to a particular reference | |
82 (e.g. chrM, or _dup chromosomes) | |
83 | |
84 -whitelist fname Remove reads that aren't on this list (by name) | |
85 -blacklist fname Remove reads that are on this list (by name) | |
86 These lists can be whitespace-delimited with | |
87 the read name as the first column. | |
88 -maximum_mismatch_ratio val | |
89 Filter by maximum mismatch ratio (fraction of length) | |
90 | |
91 -eq tag_name value | |
92 -lt tag_name value | |
93 -lte tag_name value | |
94 -gt tag_name value | |
95 -gte tag_name value | |
96 | |
97 As a special case, "MAPQ" can be used as the tag_name and the SAM MAPQ | |
98 value will be used. | |
99 | |
100 Common tags to filter by: | |
101 AS Alignment score | |
102 IH Number of alignments | |
103 NM Edit distance (each indel counts as many as its length) | |
104 | |
105 MAPQ Mapping quality (defined as part of SAM spec) | |
106 | |
107 The tag type (:i, :f, :Z) is optional. | |
108 | |
109 """ | |
110 | |
111 import os | |
112 import sys | |
113 import pysam | |
114 from ngsutils.bam import bam_iter | |
115 from ngsutils.support.dbsnp import DBSNP | |
116 from ngsutils.bam import read_calc_mismatches, read_calc_mismatches_ref, read_calc_mismatches_gen, read_calc_variations | |
117 from ngsutils.bed import BedFile | |
118 | |
119 | |
120 def usage(): | |
121 print __doc__ | |
122 print """ | |
123 Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria... | |
124 | |
125 Options: | |
126 -failed fname A text file containing the read names of all reads | |
127 that were removed with filtering | |
128 | |
129 Example: | |
130 bamutils filter filename.bam output.bam -mapped -gte AS:i 1000 | |
131 | |
132 This will remove all unmapped reads, as well as any reads that have an AS:i | |
133 value less than 1000. | |
134 """ | |
135 sys.exit(1) | |
136 | |
137 | |
138 class Unique(object): | |
139 def __init__(self, length=None): | |
140 if length: | |
141 self.length = int(length) | |
142 else: | |
143 self.length = None | |
144 | |
145 self.last_pos = None | |
146 self.pos_reads = set() | |
147 | |
148 def __repr__(self): | |
149 return "uniq" | |
150 | |
151 def filter(self, bam, read): | |
152 if self.last_pos != (read.tid, read.pos): | |
153 self.last_pos = (read.tid, read.pos) | |
154 self.pos_reads = set() | |
155 | |
156 if read.is_reverse: | |
157 seq = read.seq[::-1] # ignore revcomp for now, it isn't needed, just need to compare in the proper order | |
158 else: | |
159 seq = read.seq | |
160 | |
161 if self.length: | |
162 seq = seq[:self.length] | |
163 | |
164 if seq in self.pos_reads: | |
165 return False | |
166 | |
167 self.pos_reads.add(seq) | |
168 return True | |
169 | |
170 def close(self): | |
171 pass | |
172 | |
173 | |
174 class UniqueStart(object): | |
175 def __init__(self): | |
176 self.last_tid = None | |
177 self.last_fwd_pos = -1 | |
178 self.rev_pos_list = None | |
179 | |
180 def __repr__(self): | |
181 return "uniq_start" | |
182 | |
183 def filter(self, bam, read): | |
184 if self.last_tid is None or self.last_tid != read.tid: | |
185 self.last_tid = read.tid | |
186 self.rev_pos = set() | |
187 self.last_fwd_pos = -1 | |
188 self.last_rev_pos = -1 | |
189 | |
190 if read.is_reverse: | |
191 # check reverse reads from their start (3' aend) | |
192 # these aren't necessarily in the correct | |
193 # order in the file, so we have to track them in a set | |
194 | |
195 start_pos = read.aend | |
196 | |
197 # clean up hash if necesary | |
198 # Allow up to 100k over, to balance cleaning up the rev_pos hash | |
199 # and memory | |
200 if read.pos > (self.last_rev_pos + 100000): | |
201 self.last_rev_pos = start_pos | |
202 del_list = [] | |
203 for k in self.rev_pos: | |
204 if k < read.pos: | |
205 del_list.append(k) | |
206 | |
207 for k in del_list: | |
208 self.rev_pos.remove(k) | |
209 | |
210 if not start_pos in self.rev_pos: | |
211 self.rev_pos.add(start_pos) | |
212 return True | |
213 return False | |
214 else: | |
215 if read.pos != self.last_fwd_pos: | |
216 self.last_fwd_pos = read.pos | |
217 return True | |
218 return False | |
219 | |
220 # if self.last_pos != (read.tid, read.pos): | |
221 # self.last_pos = (read.tid, read.pos) | |
222 # return True | |
223 # return False | |
224 | |
225 def close(self): | |
226 pass | |
227 | |
228 | |
229 class Blacklist(object): | |
230 def __init__(self, fname): | |
231 self.fname = fname | |
232 self.notallowed = set() | |
233 with open(fname) as f: | |
234 for line in f: | |
235 self.notallowed.add(line.strip().split()[0]) | |
236 | |
237 def filter(self, bam, read): | |
238 return read.qname not in self.notallowed | |
239 | |
240 def __repr__(self): | |
241 return 'Blacklist: %s' % (self.fname) | |
242 | |
243 def close(self): | |
244 pass | |
245 | |
246 | |
247 class Whitelist(object): | |
248 def __init__(self, fname): | |
249 self.fname = fname | |
250 self.allowed = set() | |
251 with open(fname) as f: | |
252 for line in f: | |
253 self.allowed.add(line.strip().split()[0]) | |
254 | |
255 def filter(self, bam, read): | |
256 return read.qname in self.allowed | |
257 | |
258 def __repr__(self): | |
259 return 'Whitelist: %s' % (self.fname) | |
260 | |
261 def close(self): | |
262 pass | |
263 | |
264 | |
265 class IncludeRegion(object): | |
266 _excludes = [] | |
267 _last = None | |
268 | |
269 def __init__(self, region): | |
270 IncludeRegion._excludes.append(ExcludeRegion(region)) | |
271 # self.excl = ExcludeRegion(region) | |
272 | |
273 def filter(self, bam, read): | |
274 if read == IncludeRegion._last: | |
275 return True | |
276 | |
277 IncludeRegion._last = read | |
278 | |
279 for excl in IncludeRegion._excludes: | |
280 if not excl.filter(bam, read): | |
281 return True | |
282 return False | |
283 | |
284 # return not self.excl.filter(bam,read) | |
285 def __repr__(self): | |
286 return 'Including: %s' % (self.excl.region) | |
287 | |
288 def close(self): | |
289 pass | |
290 | |
291 | |
292 class IncludeBED(object): | |
293 def __init__(self, fname, nostrand=None): | |
294 self.excl = ExcludeBED(fname, nostrand) | |
295 | |
296 def filter(self, bam, read): | |
297 return not self.excl.filter(bam, read) | |
298 | |
299 def __repr__(self): | |
300 return 'Including from BED: %s%s' % (self.excl.fname, ' nostrand' if self.excl.nostrand else '') | |
301 | |
302 def close(self): | |
303 pass | |
304 | |
305 | |
306 class ExcludeRegion(object): | |
307 def __init__(self, region): | |
308 self.region = region | |
309 spl = region.split(':') | |
310 self.chrom = spl[0] | |
311 se = [int(x) for x in spl[1].split('-')] | |
312 self.start = se[0] - 1 | |
313 self.end = se[1] | |
314 | |
315 def filter(self, bam, read): | |
316 if not read.is_unmapped: | |
317 if bam.getrname(read.tid) == self.chrom: | |
318 if self.start <= read.pos <= self.end: | |
319 return False | |
320 if self.start <= read.aend <= self.end: | |
321 return False | |
322 return True | |
323 | |
324 def __repr__(self): | |
325 return 'Excluding: %s' % (self.region) | |
326 | |
327 def close(self): | |
328 pass | |
329 | |
330 | |
331 class ExcludeRef(object): | |
332 def __init__(self, ref): | |
333 self.ref = ref | |
334 | |
335 def filter(self, bam, read): | |
336 if not read.is_unmapped: | |
337 if bam.getrname(read.tid) == self.ref: | |
338 return False | |
339 return True | |
340 | |
341 def __repr__(self): | |
342 return 'Excluding: %s' % (self.ref) | |
343 | |
344 def close(self): | |
345 pass | |
346 | |
347 class IncludeRef(object): | |
348 def __init__(self, ref): | |
349 self.ref = ref | |
350 | |
351 def filter(self, bam, read): | |
352 if not read.is_unmapped: | |
353 if bam.getrname(read.tid) == self.ref: | |
354 return True | |
355 return False | |
356 | |
357 def __repr__(self): | |
358 return 'Including: %s' % (self.ref) | |
359 | |
360 def close(self): | |
361 pass | |
362 | |
363 | |
364 class ExcludeBED(object): | |
365 def __init__(self, fname, nostrand=None): | |
366 self.regions = {} # store BED regions as keyed bins (chrom, bin) | |
367 self.fname = fname | |
368 if nostrand == 'nostrand': | |
369 self.nostrand = True | |
370 else: | |
371 self.nostrand = False | |
372 | |
373 self.bed = BedFile(fname) | |
374 # with open(fname) as f: | |
375 # for line in f: | |
376 # if not line: | |
377 # continue | |
378 # if line[0] == '#': | |
379 # continue | |
380 # cols = line.strip().split('\t') | |
381 | |
382 # chrom = cols[0] | |
383 # start = int(cols[1]) | |
384 # end = int(cols[2]) | |
385 # if self.nostrand: | |
386 # strand = '?' | |
387 # else: | |
388 # strand = cols[5] | |
389 | |
390 # startbin = start / 100000 | |
391 # endbin = end / 100000 | |
392 | |
393 # for bin in xrange(startbin, endbin + 1): | |
394 # if not (chrom, bin) in self.regions: | |
395 # self.regions[(chrom, bin)] = [] | |
396 # self.regions[(chrom, bin)].append((start, end, strand)) | |
397 | |
398 def filter(self, bam, read): | |
399 if not read.is_unmapped: | |
400 if self.nostrand: | |
401 strand = None | |
402 elif read.is_reverse: | |
403 strand = '-' | |
404 else: | |
405 strand = '+' | |
406 | |
407 for region in self.bed.fetch(bam.getrname(read.tid), read.pos, read.aend, strand): | |
408 # region found, exclude read | |
409 return False | |
410 return True | |
411 | |
412 # bin = read.pos / 100000 | |
413 # ref = bam.getrname(read.tid) | |
414 | |
415 # if not (ref, bin) in self.regions: | |
416 # return True | |
417 | |
418 # for start, end, strand in self.regions[(ref, bin)]: | |
419 # if not self.nostrand: | |
420 # if strand == '+' and read.is_reverse: | |
421 # continue | |
422 # if strand == '-' and not read.is_reverse: | |
423 # continue | |
424 # if start <= read.pos <= end: | |
425 # return False | |
426 # if start <= read.aend <= end: | |
427 # return False | |
428 # return True | |
429 | |
430 def __repr__(self): | |
431 return 'Excluding from BED: %s%s' % (self.fname, ' nostrand' if self.nostrand else '') | |
432 | |
433 def close(self): | |
434 pass | |
435 | |
436 | |
437 class Mismatch(object): | |
438 def __init__(self, num): | |
439 self.num = int(num) | |
440 | |
441 def filter(self, bam, read): | |
442 if read.is_unmapped: | |
443 return False | |
444 | |
445 if read_calc_mismatches(read) > self.num: | |
446 return False | |
447 | |
448 return True | |
449 | |
450 def __repr__(self): | |
451 return '>%s mismatch%s' % (self.num, '' if self.num == 1 else 'es') | |
452 | |
453 def close(self): | |
454 pass | |
455 | |
456 | |
457 class MismatchRef(object): | |
458 def __init__(self, num, refname): | |
459 self.num = int(num) | |
460 self.refname = refname | |
461 | |
462 if not os.path.exists('%s.fai' % refname): | |
463 pysam.faidx(refname) | |
464 | |
465 self.ref = pysam.Fastafile(refname) | |
466 | |
467 def filter(self, bam, read): | |
468 if read.is_unmapped: | |
469 return False | |
470 | |
471 chrom = bam.getrname(read.tid) | |
472 if read_calc_mismatches_ref(self.ref, read, chrom) > self.num: | |
473 return False | |
474 | |
475 return True | |
476 | |
477 def __repr__(self): | |
478 return '>%s mismatch%s in %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.refname)) | |
479 | |
480 def close(self): | |
481 self.ref.close() | |
482 | |
483 | |
484 class MismatchDbSNP(object): | |
485 def __init__(self, num, fname, verbose=None): | |
486 sys.stderr.write('Note: MismatchDbSNP is considered *experimental*\n') | |
487 | |
488 self.num = int(num) | |
489 self.fname = fname | |
490 self.dbsnp = DBSNP(fname) | |
491 if verbose == 'verbose': | |
492 self.verbose = True | |
493 else: | |
494 self.verbose = False | |
495 | |
496 def filter(self, bam, read): | |
497 if read.is_unmapped: | |
498 return False | |
499 | |
500 if read_calc_mismatches(read) <= self.num: | |
501 return True | |
502 | |
503 chrom = bam.getrname(read.tid) | |
504 | |
505 mm = 0 | |
506 snps = 0 | |
507 | |
508 for op, pos, seq in read_calc_variations(read): | |
509 if not self.dbsnp.is_valid_variation(chrom, op, pos, seq, self.verbose): | |
510 mm += 1 | |
511 else: | |
512 snps += 1 | |
513 | |
514 if mm > self.num: | |
515 return False | |
516 | |
517 if snps: | |
518 read.tags = read.tags + [('ZS', snps)] | |
519 | |
520 return True | |
521 | |
522 def __repr__(self): | |
523 return '>%s mismatch%s using %s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.fname)) | |
524 | |
525 def close(self): | |
526 self.dbsnp.close() | |
527 | |
528 | |
529 class MismatchRefDbSNP(object): | |
530 def __init__(self, num, refname, dbsnpname): | |
531 sys.stderr.write('Note: MismatchRefDbSNP is considered *experimental*\n') | |
532 self.num = int(num) | |
533 self.refname = refname | |
534 self.dbsnp = DBSNP(dbsnpname) | |
535 | |
536 if not os.path.exists('%s.fai' % refname): | |
537 pysam.faidx(refname) | |
538 | |
539 self.ref = pysam.Fastafile(refname) | |
540 | |
541 def filter(self, bam, read): | |
542 if read.is_unmapped: | |
543 return False | |
544 | |
545 chrom = bam.getrname(read.tid) | |
546 | |
547 mm = 0 | |
548 snps = 0 | |
549 | |
550 for op, pos, seq in read_calc_mismatches_gen(self.ref, read, chrom): | |
551 if not self.dbsnp.is_valid_variation(chrom, op, pos, seq): | |
552 mm += 1 | |
553 else: | |
554 snps += 1 | |
555 | |
556 if mm > self.num: | |
557 return False | |
558 | |
559 if snps: | |
560 read.tags = read.tags + [('ZS', snps)] | |
561 | |
562 return True | |
563 | |
564 def __repr__(self): | |
565 return '>%s mismatch%s using %s/%s' % (self.num, '' if self.num == 1 else 'es', os.path.basename(self.dbsnpname), os.path.basename(self.refname)) | |
566 | |
567 def close(self): | |
568 self.ref.close() | |
569 self.dbsnp.close() | |
570 | |
571 | |
572 class Mapped(object): | |
573 def __init__(self): | |
574 pass | |
575 | |
576 def filter(self, bam, read): | |
577 if read.is_paired and (read.is_unmapped or read.mate_is_unmapped): | |
578 return False | |
579 elif read.is_unmapped: | |
580 return False | |
581 return True | |
582 | |
583 def __repr__(self): | |
584 return 'is mapped' | |
585 | |
586 def close(self): | |
587 pass | |
588 | |
589 | |
590 class Unmapped(object): | |
591 def __init__(self): | |
592 pass | |
593 | |
594 def filter(self, bam, read): | |
595 if read.is_paired and (read.is_unmapped or read.mate_is_unmapped): | |
596 return True | |
597 elif read.is_unmapped: | |
598 return True | |
599 return False | |
600 | |
601 def __repr__(self): | |
602 return 'is unmapped' | |
603 | |
604 def close(self): | |
605 pass | |
606 | |
607 | |
608 class ProperPair(object): | |
609 def __init__(self): | |
610 pass | |
611 | |
612 def filter(self, bam, read): | |
613 if not read.is_paired: | |
614 return False | |
615 | |
616 if read.is_unmapped or read.mate_is_unmapped: | |
617 return False | |
618 | |
619 if read.is_reverse == read.mate_is_reverse: | |
620 return False | |
621 | |
622 return read.is_proper_pair | |
623 | |
624 def __repr__(self): | |
625 return 'proper pair' | |
626 | |
627 def close(self): | |
628 pass | |
629 | |
630 | |
631 class NoProperPair(object): | |
632 def __init__(self): | |
633 self.proper = ProperPair() | |
634 pass | |
635 | |
636 def filter(self, bam, read): | |
637 return not self.proper.filter(bam, read) | |
638 | |
639 def __repr__(self): | |
640 return 'not proper pairs' | |
641 | |
642 def close(self): | |
643 pass | |
644 | |
645 | |
646 class MaskFlag(object): | |
647 def __init__(self, value): | |
648 if type(value) == type(1): | |
649 self.flag = value | |
650 else: | |
651 if value[0:2] == '0x': | |
652 self.flag = int(value, 16) | |
653 else: | |
654 self.flag = int(value) | |
655 | |
656 def __repr__(self): | |
657 return "Doesn't match flag: %s" % self.flag | |
658 | |
659 def filter(self, bam, read): | |
660 return (read.flag & self.flag) == 0 | |
661 | |
662 def close(self): | |
663 pass | |
664 | |
665 | |
666 class SecondaryFlag(object): | |
667 def __repr__(self): | |
668 return "no 0x100 (secondary) flag" | |
669 | |
670 def filter(self, bam, read): | |
671 return not read.is_secondary | |
672 | |
673 def close(self): | |
674 pass | |
675 | |
676 | |
677 class ReadMinLength(object): | |
678 def __init__(self, minval): | |
679 self.minval = int(minval) | |
680 | |
681 def __repr__(self): | |
682 return "read length min: %s" % self.minval | |
683 | |
684 def filter(self, bam, read): | |
685 return len(read.seq) >= self.minval | |
686 | |
687 def close(self): | |
688 pass | |
689 | |
690 | |
691 class ReadMaxLength(object): | |
692 def __init__(self, val): | |
693 self.val = int(val) | |
694 | |
695 def __repr__(self): | |
696 return "read length max: %s" % self.val | |
697 | |
698 def filter(self, bam, read): | |
699 return len(read.seq) <= self.val | |
700 | |
701 def close(self): | |
702 pass | |
703 | |
704 | |
705 class MaximumMismatchRatio(object): | |
706 def __init__(self, ratio): | |
707 self.ratio = float(ratio) | |
708 | |
709 def __repr__(self): | |
710 return "maximum mismatch ratio: %s" % self.val | |
711 | |
712 def filter(self, bam, read): | |
713 return read_calc_mismatches(read) <= self.ratio*len(read.seq) | |
714 | |
715 def close(self): | |
716 pass | |
717 | |
718 | |
719 class QCFailFlag(object): | |
720 def __repr__(self): | |
721 return "no 0x200 (qcfail) flag" | |
722 | |
723 def filter(self, bam, read): | |
724 return not read.is_qcfail | |
725 | |
726 def close(self): | |
727 pass | |
728 | |
729 | |
730 class PCRDupFlag(object): | |
731 def __repr__(self): | |
732 return "no 0x400 (pcrdup) flag" | |
733 | |
734 def filter(self, bam, read): | |
735 return not read.is_duplicate | |
736 | |
737 def close(self): | |
738 pass | |
739 | |
740 | |
741 class _TagCompare(object): | |
742 def __init__(self, tag, value): | |
743 self.args = '%s %s' % (tag, value) | |
744 | |
745 if ':' in tag: | |
746 self.tag = tag.split(':')[0] | |
747 tagtype = tag.split(':')[1] | |
748 if tagtype == 'i': | |
749 self.value = int(value) | |
750 elif tagtype == 'f': | |
751 self.value = float(value) | |
752 elif tagtype == 'H': | |
753 self.value = int(value) # not sure about this | |
754 else: | |
755 self.value = value | |
756 else: | |
757 self.tag = tag | |
758 | |
759 # guess at type... | |
760 try: | |
761 self.value = int(value) | |
762 except: | |
763 try: | |
764 self.value = float(value) | |
765 except: | |
766 self.value = value | |
767 | |
768 def get_value(self, read): | |
769 if self.tag == 'MAPQ': | |
770 return read.mapq | |
771 | |
772 for name, value in read.tags: | |
773 if name == self.tag: | |
774 return value | |
775 | |
776 return None | |
777 | |
778 def __repr__(self): | |
779 return "%s %s %s" % (self.tag, self.__class__.op, self.value) | |
780 | |
781 def close(self): | |
782 pass | |
783 | |
784 | |
785 class TagLessThan(_TagCompare): | |
786 op = '<' | |
787 | |
788 def filter(self, bam, read): | |
789 if self.get_value(read) < self.value: | |
790 return True | |
791 return False | |
792 | |
793 | |
794 class TagLessThanEqual(_TagCompare): | |
795 op = '<=' | |
796 | |
797 def filter(self, bam, read): | |
798 if self.get_value(read) <= self.value: | |
799 return True | |
800 return False | |
801 | |
802 | |
803 class TagGreaterThan(_TagCompare): | |
804 op = '>' | |
805 | |
806 def filter(self, bam, read): | |
807 if self.get_value(read) > self.value: | |
808 return True | |
809 return False | |
810 | |
811 | |
812 class TagGreaterThanEqual(_TagCompare): | |
813 op = '>=' | |
814 | |
815 def filter(self, bam, read): | |
816 if self.get_value(read) >= self.value: | |
817 return True | |
818 return False | |
819 | |
820 | |
821 class TagEqual(_TagCompare): | |
822 op = '=' | |
823 | |
824 def filter(self, bam, read): | |
825 if self.get_value(read) == self.value: | |
826 return True | |
827 return False | |
828 | |
829 _criteria = { | |
830 'mapped': Mapped, | |
831 'unmapped': Unmapped, | |
832 'properpair': ProperPair, | |
833 'noproperpair': NoProperPair, | |
834 'noqcfail': QCFailFlag, | |
835 'nosecondary': SecondaryFlag, | |
836 'nopcrdup': PCRDupFlag, | |
837 'mask': MaskFlag, | |
838 'lt': TagLessThan, | |
839 'gt': TagGreaterThan, | |
840 'lte': TagLessThanEqual, | |
841 'gte': TagGreaterThanEqual, | |
842 'eq': TagEqual, | |
843 'mismatch': Mismatch, | |
844 'mismatch_ref': MismatchRef, | |
845 'mismatch_dbsnp': MismatchDbSNP, | |
846 'mismatch_ref_dbsnp': MismatchRefDbSNP, | |
847 'exclude': ExcludeRegion, | |
848 'excludebed': ExcludeBED, | |
849 'excluderef': ExcludeRef, | |
850 'includeref': IncludeRef, | |
851 'include': IncludeRegion, | |
852 'includebed': IncludeBED, | |
853 'whitelist': Whitelist, | |
854 'blacklist': Blacklist, | |
855 'length': ReadMinLength, # deprecated | |
856 'minlen': ReadMinLength, | |
857 'maxlen': ReadMaxLength, | |
858 'uniq': Unique, | |
859 'maximum_mismatch_ratio': MaximumMismatchRatio | |
860 } | |
861 | |
862 | |
863 def bam_filter(infile, outfile, criteria, failedfile=None, verbose=False): | |
864 if verbose: | |
865 sys.stderr.write('Input file : %s\n' % infile) | |
866 sys.stderr.write('Output file : %s\n' % outfile) | |
867 if failedfile: | |
868 sys.stderr.write('Failed reads: %s\n' % failedfile) | |
869 sys.stderr.write('Criteria:\n') | |
870 for criterion in criteria: | |
871 sys.stderr.write(' %s\n' % criterion) | |
872 | |
873 sys.stderr.write('\n') | |
874 | |
875 bamfile = pysam.Samfile(infile, "rb") | |
876 outfile = pysam.Samfile(outfile, "wb", template=bamfile) | |
877 | |
878 if failedfile: | |
879 failed_out = open(failedfile, 'w') | |
880 else: | |
881 failed_out = None | |
882 | |
883 passed = 0 | |
884 failed = 0 | |
885 | |
886 def _callback(read): | |
887 return "%s | %s kept,%s failed" % ('%s:%s' % (bamfile.getrname(read.tid), read.pos) if read.tid > -1 else 'unk', passed, failed) | |
888 | |
889 for read in bam_iter(bamfile, quiet=True): | |
890 p = True | |
891 | |
892 for criterion in criteria: | |
893 if not criterion.filter(bamfile, read): | |
894 p = False | |
895 failed += 1 | |
896 if failed_out: | |
897 failed_out.write('%s\t%s\n' % (read.qname, criterion)) | |
898 #outfile.write(read_to_unmapped(read)) | |
899 break | |
900 if p: | |
901 passed += 1 | |
902 outfile.write(read) | |
903 | |
904 bamfile.close() | |
905 outfile.close() | |
906 if failed_out: | |
907 failed_out.close() | |
908 sys.stdout.write("%s kept\n%s failed\n" % (passed, failed)) | |
909 | |
910 for criterion in criteria: | |
911 criterion.close() | |
912 | |
913 | |
914 def read_to_unmapped(read): | |
915 ''' | |
916 Example unmapped read | |
917 2358_2045_1839 | 4 | * | 0 | 0 | * | * | 0 | 0 | GGACGAGAAGGAGTATTTCTCCGAGAACACATTCACGGAGAGTCTAACTC | 0QT\VNO^IIRJKXTIHIKTY\STZZ[XOJKPWLHJQQQ^XPQYIIRQII | PG:Z:bfast | AS:i:- | NH:i:0 | IH:i:1 | HI:i:1 | CS:Z:T10213222020221330022203222011113021130222212230122 | CQ:Z:019=2+><%@.'>52%)'15?90<7;:5+(-49('-7-<>5-@5%<.2%= | CM:i:0 | XA:i:4 | |
918 | |
919 Example mapped read: | |
920 2216_1487_198 | 16 | chr11:19915291-19925392 | 5531 | 12 | 24M2D26M | * | 0 | 0 | TCTGTCTGGGTGCAGTAGCTATACGTAATCCCAGCACTTTGGGAGGCCAA | 1C8FZ`M""""""WZ""%"#\I;"`R@D]``ZZ``\QKX\Y]````IK^` | PG:Z:bfast | AS:i:1150 | NM:i:2 | NH:i:10 | IH:i:1 | HI:i:1 | MD:Z:24^CT26 | CS:Z:T00103022001002113210023031213331122121223321221122 | CQ:Z:A8=%;AB<;97<3/9:>>3>@?5&18@-%;866:94=14:=?,%?:7&)1 | CM:i:9 | XA:i:4 | XE:Z:-------23322---2-11----2---------------------------- | |
921 | |
922 ''' | |
923 # TODO: rewrite read properties to unmapped state | |
924 # if colorspace: convert CS to NT directly | |
925 # remove excess tags | |
926 | |
927 read.is_unmapped = True | |
928 read.tid = None | |
929 read.pos = 0 | |
930 read.mapq = 0 | |
931 return read | |
932 | |
933 if __name__ == '__main__': | |
934 infile = None | |
935 outfile = None | |
936 failed = None | |
937 criteria = [] | |
938 | |
939 crit_args = [] | |
940 last = None | |
941 verbose = False | |
942 fail = False | |
943 | |
944 for arg in sys.argv[1:]: | |
945 if last == '-failed': | |
946 failed = arg | |
947 last = None | |
948 elif arg == '-h': | |
949 usage() | |
950 elif arg == '-failed': | |
951 last = arg | |
952 elif arg == '-v': | |
953 verbose = True | |
954 elif not infile and os.path.exists(arg): | |
955 infile = arg | |
956 elif not outfile: | |
957 outfile = arg | |
958 elif arg[0] == '-': | |
959 if not arg[1:] in _criteria: | |
960 print "Unknown criterion: %s" % arg | |
961 fail = True | |
962 if crit_args: | |
963 criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:])) | |
964 crit_args = [arg, ] | |
965 elif crit_args: | |
966 crit_args.append(arg) | |
967 else: | |
968 print "Unknown argument: %s" % arg | |
969 fail = True | |
970 | |
971 if not fail and crit_args: | |
972 criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:])) | |
973 | |
974 if fail or not infile or not outfile or not criteria: | |
975 if not infile and not outfile and not criteria: | |
976 usage() | |
977 | |
978 if not infile: | |
979 print "Missing: input bamfile" | |
980 if not outfile: | |
981 print "Missing: output bamfile" | |
982 if not criteria: | |
983 print "Missing: filtering criteria" | |
984 usage() | |
985 else: | |
986 bam_filter(infile, outfile, criteria, failed, verbose) |