comparison bwa-0.6.2/bwt.c @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
comparison
equal deleted inserted replaced
1:a9636dc1e99a 2:a294fbfcb1db
1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <assert.h>
32 #include <stdint.h>
33 #include "utils.h"
34 #include "bwt.h"
35 #include "kvec.h"
36
37 void bwt_gen_cnt_table(bwt_t *bwt)
38 {
39 int i, j;
40 for (i = 0; i != 256; ++i) {
41 uint32_t x = 0;
42 for (j = 0; j != 4; ++j)
43 x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
44 bwt->cnt_table[i] = x;
45 }
46 }
47
48 // bwt->bwt and bwt->occ must be precalculated
49 void bwt_cal_sa(bwt_t *bwt, int intv)
50 {
51 bwtint_t isa, sa, i; // S(isa) = sa
52 int intv_round = intv;
53
54 kv_roundup32(intv_round);
55 xassert(intv_round == intv, "SA sample interval is not a power of 2.");
56 xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
57
58 if (bwt->sa) free(bwt->sa);
59 bwt->sa_intv = intv;
60 bwt->n_sa = (bwt->seq_len + intv) / intv;
61 bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
62 if (bwt->sa == 0) {
63 fprintf(stderr, "[%s] Fail to allocate %.3fMB memory. Abort!\n", __func__, bwt->n_sa * sizeof(bwtint_t) / 1024.0/1024.0);
64 abort();
65 }
66 // calculate SA value
67 isa = 0; sa = bwt->seq_len;
68 for (i = 0; i < bwt->seq_len; ++i) {
69 if (isa % intv == 0) bwt->sa[isa/intv] = sa;
70 --sa;
71 isa = bwt_invPsi(bwt, isa);
72 }
73 if (isa % intv == 0) bwt->sa[isa/intv] = sa;
74 bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
75 }
76
77 bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
78 {
79 bwtint_t sa = 0, mask = bwt->sa_intv - 1;
80 while (k & mask) {
81 ++sa;
82 k = bwt_invPsi(bwt, k);
83 }
84 /* without setting bwt->sa[0] = -1, the following line should be
85 changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
86 return sa + bwt->sa[k/bwt->sa_intv];
87 }
88
89 static inline int __occ_aux(uint64_t y, int c)
90 {
91 // reduce nucleotide counting to bits counting
92 y = ((c&2)? y : ~y) >> 1 & ((c&1)? y : ~y) & 0x5555555555555555ull;
93 // count the number of 1s in y
94 y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull);
95 return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
96 }
97
98 inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
99 {
100 bwtint_t n, l, j;
101 uint32_t *p;
102
103 if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
104 if (k == (bwtint_t)(-1)) return 0;
105 if (k >= bwt->primary) --k; // because $ is not in bwt
106
107 // retrieve Occ at k/OCC_INTERVAL
108 n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
109 p += sizeof(bwtint_t); // jump to the start of the first BWT cell
110
111 // calculate Occ up to the last k/32
112 j = k >> 5 << 5;
113 for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2)
114 n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
115
116 // calculate Occ
117 n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
118 if (c == 0) n -= ~k&31; // corrected for the masked bits
119
120 return n;
121 }
122
123 // an analogy to bwt_occ() but more efficient, requiring k <= l
124 inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
125 {
126 bwtint_t _k, _l;
127 _k = (k >= bwt->primary)? k-1 : k;
128 _l = (l >= bwt->primary)? l-1 : l;
129 if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
130 *ok = bwt_occ(bwt, k, c);
131 *ol = bwt_occ(bwt, l, c);
132 } else {
133 bwtint_t m, n, i, j;
134 uint32_t *p;
135 if (k >= bwt->primary) --k;
136 if (l >= bwt->primary) --l;
137 n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
138 p += sizeof(bwtint_t);
139 // calculate *ok
140 j = k >> 5 << 5;
141 for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2)
142 n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
143 m = n;
144 n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
145 if (c == 0) n -= ~k&31; // corrected for the masked bits
146 *ok = n;
147 // calculate *ol
148 j = l >> 5 << 5;
149 for (; i < j; i += 32, p += 2)
150 m += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
151 m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c);
152 if (c == 0) m -= ~l&31; // corrected for the masked bits
153 *ol = m;
154 }
155 }
156
157 #define __occ_aux4(bwt, b) \
158 ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \
159 + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24])
160
161 inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
162 {
163 bwtint_t l, j, x;
164 uint32_t *p;
165 if (k == (bwtint_t)(-1)) {
166 memset(cnt, 0, 4 * sizeof(bwtint_t));
167 return;
168 }
169 if (k >= bwt->primary) --k; // because $ is not in bwt
170 p = bwt_occ_intv(bwt, k);
171 memcpy(cnt, p, 4 * sizeof(bwtint_t));
172 p += sizeof(bwtint_t);
173 j = k >> 4 << 4;
174 for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p)
175 x += __occ_aux4(bwt, *p);
176 x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
177 cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
178 }
179
180 // an analogy to bwt_occ4() but more efficient, requiring k <= l
181 inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
182 {
183 bwtint_t _k, _l;
184 _k = (k >= bwt->primary)? k-1 : k;
185 _l = (l >= bwt->primary)? l-1 : l;
186 if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
187 bwt_occ4(bwt, k, cntk);
188 bwt_occ4(bwt, l, cntl);
189 } else {
190 bwtint_t i, j, x, y;
191 uint32_t *p;
192 if (k >= bwt->primary) --k; // because $ is not in bwt
193 if (l >= bwt->primary) --l;
194 p = bwt_occ_intv(bwt, k);
195 memcpy(cntk, p, 4 * sizeof(bwtint_t));
196 p += sizeof(bwtint_t);
197 // prepare cntk[]
198 j = k >> 4 << 4;
199 for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p)
200 x += __occ_aux4(bwt, *p);
201 y = x;
202 x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
203 // calculate cntl[] and finalize cntk[]
204 j = l >> 4 << 4;
205 for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p);
206 y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15);
207 memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
208 cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
209 cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
210 }
211 }
212
213 int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
214 {
215 bwtint_t k, l, ok, ol;
216 int i;
217 k = 0; l = bwt->seq_len;
218 for (i = len - 1; i >= 0; --i) {
219 ubyte_t c = str[i];
220 if (c > 3) return 0; // no match
221 bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
222 k = bwt->L2[c] + ok + 1;
223 l = bwt->L2[c] + ol;
224 if (k > l) break; // no match
225 }
226 if (k > l) return 0; // no match
227 if (sa_begin) *sa_begin = k;
228 if (sa_end) *sa_end = l;
229 return l - k + 1;
230 }
231
232 int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0)
233 {
234 int i;
235 bwtint_t k, l, ok, ol;
236 k = *k0; l = *l0;
237 for (i = len - 1; i >= 0; --i) {
238 ubyte_t c = str[i];
239 if (c > 3) return 0; // there is an N here. no match
240 bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
241 k = bwt->L2[c] + ok + 1;
242 l = bwt->L2[c] + ol;
243 if (k > l) return 0; // no match
244 }
245 *k0 = k; *l0 = l;
246 return l - k + 1;
247 }
248
249 /*********************
250 * Bidirectional BWT *
251 *********************/
252
253 void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
254 {
255 bwtint_t tk[4], tl[4];
256 int i;
257 bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
258 for (i = 0; i != 4; ++i) {
259 ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i];
260 ok[i].x[2] = tl[i] - tk[i];
261 }
262 ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary);
263 ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
264 ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
265 ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
266 }
267
268 static void bwt_reverse_intvs(bwtintv_v *p)
269 {
270 if (p->n > 1) {
271 int j;
272 for (j = 0; j < p->n>>1; ++j) {
273 bwtintv_t tmp = p->a[p->n - 1 - j];
274 p->a[p->n - 1 - j] = p->a[j];
275 p->a[j] = tmp;
276 }
277 }
278 }
279
280 int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2])
281 {
282 int i, j, c, ret;
283 bwtintv_t ik, ok[4];
284 bwtintv_v a[2], *prev, *curr, *swap;
285
286 mem->n = 0;
287 if (q[x] > 3) return x + 1;
288 kv_init(a[0]); kv_init(a[1]);
289 prev = tmpvec[0]? tmpvec[0] : &a[0];
290 curr = tmpvec[1]? tmpvec[1] : &a[1];
291 bwt_set_intv(bwt, q[x], ik);
292 ik.info = x + 1;
293
294 for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
295 if (q[i] < 4) {
296 c = 3 - q[i];
297 bwt_extend(bwt, &ik, ok, 0);
298 if (ok[c].x[2] != ik.x[2]) // change of the interval size
299 kv_push(bwtintv_t, *curr, ik);
300 if (ok[c].x[2] == 0) break; // cannot be extended
301 ik = ok[c]; ik.info = i + 1;
302 } else { // an ambiguous base
303 kv_push(bwtintv_t, *curr, ik);
304 break; // cannot be extended; in this case, i<len always stands
305 }
306 }
307 if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
308 bwt_reverse_intvs(curr); // s.t. smaller intervals visited first
309 ret = curr->a[0].info; // this will be the returned value
310 swap = curr; curr = prev; prev = swap;
311
312 for (i = x - 1; i >= -1; --i) { // backward search for MEMs
313 if (q[i] > 3) break;
314 c = i < 0? 0 : q[i];
315 for (j = 0, curr->n = 0; j < prev->n; ++j) {
316 bwtintv_t *p = &prev->a[j];
317 bwt_extend(bwt, p, ok, 1);
318 if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further
319 if (curr->n == 0) { // curr->n to make sure there is no longer matches
320 if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
321 ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
322 kv_push(bwtintv_t, *mem, ik);
323 }
324 } // otherwise the match is contained in another longer match
325 }
326 if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) {
327 ok[c].info = p->info;
328 kv_push(bwtintv_t, *curr, ok[c]);
329 }
330 }
331 if (curr->n == 0) break;
332 swap = curr; curr = prev; prev = swap;
333 }
334 bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
335
336 if (tmpvec[0] == 0) free(a[0].a);
337 if (tmpvec[1] == 0) free(a[1].a);
338 return ret;
339 }