annotate pyPRADA_1.2/tools/samtools-0.1.16/bam_rmdupse.c @ 3:f17965495ec9 draft default tip

Uploaded
author siyuan
date Tue, 11 Mar 2014 12:14:01 -0400
parents acc2ca1a3ba4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #include <math.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2 #include "sam.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 #include "khash.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4 #include "klist.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 #define QUEUE_CLEAR_SIZE 0x100000
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 #define MAX_POS 0x7fffffff
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 typedef struct {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 int endpos;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 uint32_t score:31, discarded:1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 bam1_t *b;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 } elem_t, *elem_p;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 #define __free_elem(p) bam_destroy1((p)->data.b)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 KLIST_INIT(q, elem_t, __free_elem)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 typedef klist_t(q) queue_t;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 KHASH_MAP_INIT_INT(best, elem_p)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 typedef khash_t(best) besthash_t;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 typedef struct {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 uint64_t n_checked, n_removed;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 besthash_t *left, *rght;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 } lib_aux_t;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25 KHASH_MAP_INIT_STR(lib, lib_aux_t)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27 static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 khint_t k = kh_get(lib, aux, lib);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 if (k == kh_end(aux)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31 int ret;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 char *p = strdup(lib);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 lib_aux_t *q;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 k = kh_put(lib, aux, p, &ret);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35 q = &kh_val(aux, k);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 q->left = kh_init(best);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 q->rght = kh_init(best);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38 q->n_checked = q->n_removed = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 return q;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 } else return &kh_val(aux, k);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 static inline int sum_qual(const bam1_t *b)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 int i, q;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46 uint8_t *qual = bam1_qual(b);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47 for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 return q;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 elem_t *p = kl_pushp(q, queue);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54 p->discarded = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 p->endpos = endpos; p->score = score;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56 if (p->b == 0) p->b = bam_init1();
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57 bam_copy1(p->b, b);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 return p;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 static void clear_besthash(besthash_t *h, int32_t pos)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63 khint_t k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 for (k = kh_begin(h); k != kh_end(h); ++k)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 kh_del(best, h, k);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 khint_t k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 while (1) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 elem_t *q;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 if (queue->head == queue->tail) break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 q = &kl_val(queue->head);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 if (q->discarded) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 q->b->data_len = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 kl_shift(q, queue, 0);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 continue;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 samwrite(out, q->b);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 q->b->data_len = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 kl_shift(q, queue, 0);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 for (k = kh_begin(h); k != kh_end(h); ++k) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 if (kh_exist(h, k)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 clear_besthash(kh_val(h, k).left, pos);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 clear_besthash(kh_val(h, k).rght, pos);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 bam1_t *b;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 queue_t *queue;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 khint_t k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 int last_tid = -2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 khash_t(lib) *aux;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 aux = kh_init(lib);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 b = bam_init1();
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 queue = kl_init(q);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 while (samread(in, b) >= 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108 bam1_core_t *c = &b->core;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 int endpos = bam_calend(c, bam1_cigar(b));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 int score = sum_qual(b);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 if (last_tid != c->tid) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 last_tid = c->tid;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115 } else dump_alignment(out, queue, c->pos, aux);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117 push_queue(queue, b, endpos, score);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119 const char *lib;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 lib_aux_t *q;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121 besthash_t *h;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 uint32_t key;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123 int ret;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 lib = bam_get_library(in->header, b);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125 q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126 ++q->n_checked;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127 h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 k = kh_put(best, h, key, &ret);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130 if (ret == 0) { // in the hash table
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 elem_t *p = kh_val(h, k);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132 ++q->n_removed;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133 if (p->score < score) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
134 if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
135 p->discarded = 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
136 kh_val(h, k) = push_queue(queue, b, endpos, score);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
137 } else { // replace
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
138 p->score = score; p->endpos = endpos;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
139 bam_copy1(p->b, b);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
140 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
141 } // otherwise, discard the alignment
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
142 } else kh_val(h, k) = push_queue(queue, b, endpos, score);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145 dump_alignment(out, queue, MAX_POS, aux);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
147 for (k = kh_begin(aux); k != kh_end(aux); ++k) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
148 if (kh_exist(aux, k)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
149 lib_aux_t *q = &kh_val(aux, k);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
150 fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
151 (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
152 kh_destroy(best, q->left); kh_destroy(best, q->rght);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
153 free((char*)kh_key(aux, k));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
154 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
155 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
156 kh_destroy(lib, aux);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
157 bam_destroy1(b);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
158 kl_destroy(q, queue);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159 }