0
|
1 #include <math.h>
|
|
2 #include "sam.h"
|
|
3 #include "khash.h"
|
|
4 #include "klist.h"
|
|
5
|
|
6 #define QUEUE_CLEAR_SIZE 0x100000
|
|
7 #define MAX_POS 0x7fffffff
|
|
8
|
|
9 typedef struct {
|
|
10 int endpos;
|
|
11 uint32_t score:31, discarded:1;
|
|
12 bam1_t *b;
|
|
13 } elem_t, *elem_p;
|
|
14 #define __free_elem(p) bam_destroy1((p)->data.b)
|
|
15 KLIST_INIT(q, elem_t, __free_elem)
|
|
16 typedef klist_t(q) queue_t;
|
|
17
|
|
18 KHASH_MAP_INIT_INT(best, elem_p)
|
|
19 typedef khash_t(best) besthash_t;
|
|
20
|
|
21 typedef struct {
|
|
22 uint64_t n_checked, n_removed;
|
|
23 besthash_t *left, *rght;
|
|
24 } lib_aux_t;
|
|
25 KHASH_MAP_INIT_STR(lib, lib_aux_t)
|
|
26
|
|
27 static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
|
|
28 {
|
|
29 khint_t k = kh_get(lib, aux, lib);
|
|
30 if (k == kh_end(aux)) {
|
|
31 int ret;
|
|
32 char *p = strdup(lib);
|
|
33 lib_aux_t *q;
|
|
34 k = kh_put(lib, aux, p, &ret);
|
|
35 q = &kh_val(aux, k);
|
|
36 q->left = kh_init(best);
|
|
37 q->rght = kh_init(best);
|
|
38 q->n_checked = q->n_removed = 0;
|
|
39 return q;
|
|
40 } else return &kh_val(aux, k);
|
|
41 }
|
|
42
|
|
43 static inline int sum_qual(const bam1_t *b)
|
|
44 {
|
|
45 int i, q;
|
|
46 uint8_t *qual = bam1_qual(b);
|
|
47 for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
|
|
48 return q;
|
|
49 }
|
|
50
|
|
51 static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
|
|
52 {
|
|
53 elem_t *p = kl_pushp(q, queue);
|
|
54 p->discarded = 0;
|
|
55 p->endpos = endpos; p->score = score;
|
|
56 if (p->b == 0) p->b = bam_init1();
|
|
57 bam_copy1(p->b, b);
|
|
58 return p;
|
|
59 }
|
|
60
|
|
61 static void clear_besthash(besthash_t *h, int32_t pos)
|
|
62 {
|
|
63 khint_t k;
|
|
64 for (k = kh_begin(h); k != kh_end(h); ++k)
|
|
65 if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos)
|
|
66 kh_del(best, h, k);
|
|
67 }
|
|
68
|
|
69 static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
|
|
70 {
|
|
71 if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
|
|
72 khint_t k;
|
|
73 while (1) {
|
|
74 elem_t *q;
|
|
75 if (queue->head == queue->tail) break;
|
|
76 q = &kl_val(queue->head);
|
|
77 if (q->discarded) {
|
|
78 q->b->data_len = 0;
|
|
79 kl_shift(q, queue, 0);
|
|
80 continue;
|
|
81 }
|
|
82 if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
|
|
83 samwrite(out, q->b);
|
|
84 q->b->data_len = 0;
|
|
85 kl_shift(q, queue, 0);
|
|
86 }
|
|
87 for (k = kh_begin(h); k != kh_end(h); ++k) {
|
|
88 if (kh_exist(h, k)) {
|
|
89 clear_besthash(kh_val(h, k).left, pos);
|
|
90 clear_besthash(kh_val(h, k).rght, pos);
|
|
91 }
|
|
92 }
|
|
93 }
|
|
94 }
|
|
95
|
|
96 void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
|
|
97 {
|
|
98 bam1_t *b;
|
|
99 queue_t *queue;
|
|
100 khint_t k;
|
|
101 int last_tid = -2;
|
|
102 khash_t(lib) *aux;
|
|
103
|
|
104 aux = kh_init(lib);
|
|
105 b = bam_init1();
|
|
106 queue = kl_init(q);
|
|
107 while (samread(in, b) >= 0) {
|
|
108 bam1_core_t *c = &b->core;
|
|
109 int endpos = bam_calend(c, bam1_cigar(b));
|
|
110 int score = sum_qual(b);
|
|
111
|
|
112 if (last_tid != c->tid) {
|
|
113 if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
|
|
114 last_tid = c->tid;
|
|
115 } else dump_alignment(out, queue, c->pos, aux);
|
|
116 if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
|
|
117 push_queue(queue, b, endpos, score);
|
|
118 } else {
|
|
119 const char *lib;
|
|
120 lib_aux_t *q;
|
|
121 besthash_t *h;
|
|
122 uint32_t key;
|
|
123 int ret;
|
|
124 lib = bam_get_library(in->header, b);
|
|
125 q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
|
|
126 ++q->n_checked;
|
|
127 h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
|
|
128 key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
|
|
129 k = kh_put(best, h, key, &ret);
|
|
130 if (ret == 0) { // in the hash table
|
|
131 elem_t *p = kh_val(h, k);
|
|
132 ++q->n_removed;
|
|
133 if (p->score < score) {
|
|
134 if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
|
|
135 p->discarded = 1;
|
|
136 kh_val(h, k) = push_queue(queue, b, endpos, score);
|
|
137 } else { // replace
|
|
138 p->score = score; p->endpos = endpos;
|
|
139 bam_copy1(p->b, b);
|
|
140 }
|
|
141 } // otherwise, discard the alignment
|
|
142 } else kh_val(h, k) = push_queue(queue, b, endpos, score);
|
|
143 }
|
|
144 }
|
|
145 dump_alignment(out, queue, MAX_POS, aux);
|
|
146
|
|
147 for (k = kh_begin(aux); k != kh_end(aux); ++k) {
|
|
148 if (kh_exist(aux, k)) {
|
|
149 lib_aux_t *q = &kh_val(aux, k);
|
|
150 fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
|
|
151 (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
|
|
152 kh_destroy(best, q->left); kh_destroy(best, q->rght);
|
|
153 free((char*)kh_key(aux, k));
|
|
154 }
|
|
155 }
|
|
156 kh_destroy(lib, aux);
|
|
157 bam_destroy1(b);
|
|
158 kl_destroy(q, queue);
|
|
159 }
|