Mercurial > repos > ashvark > qiime_1_8_0
diff bwa-0.6.2/bwt.c @ 2:a294fbfcb1db draft default tip
Uploaded BWA
author | ashvark |
---|---|
date | Fri, 18 Jul 2014 07:55:59 -0400 |
parents | dd1186b11b3b |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bwa-0.6.2/bwt.c Fri Jul 18 07:55:59 2014 -0400 @@ -0,0 +1,339 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li <lh3@sanger.ac.uk> */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <stdint.h> +#include "utils.h" +#include "bwt.h" +#include "kvec.h" + +void bwt_gen_cnt_table(bwt_t *bwt) +{ + int i, j; + for (i = 0; i != 256; ++i) { + uint32_t x = 0; + for (j = 0; j != 4; ++j) + x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); + bwt->cnt_table[i] = x; + } +} + +// bwt->bwt and bwt->occ must be precalculated +void bwt_cal_sa(bwt_t *bwt, int intv) +{ + bwtint_t isa, sa, i; // S(isa) = sa + int intv_round = intv; + + kv_roundup32(intv_round); + xassert(intv_round == intv, "SA sample interval is not a power of 2."); + xassert(bwt->bwt, "bwt_t::bwt is not initialized."); + + if (bwt->sa) free(bwt->sa); + bwt->sa_intv = intv; + bwt->n_sa = (bwt->seq_len + intv) / intv; + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + if (bwt->sa == 0) { + fprintf(stderr, "[%s] Fail to allocate %.3fMB memory. Abort!\n", __func__, bwt->n_sa * sizeof(bwtint_t) / 1024.0/1024.0); + abort(); + } + // calculate SA value + isa = 0; sa = bwt->seq_len; + for (i = 0; i < bwt->seq_len; ++i) { + if (isa % intv == 0) bwt->sa[isa/intv] = sa; + --sa; + isa = bwt_invPsi(bwt, isa); + } + if (isa % intv == 0) bwt->sa[isa/intv] = sa; + bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len +} + +bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k) +{ + bwtint_t sa = 0, mask = bwt->sa_intv - 1; + while (k & mask) { + ++sa; + k = bwt_invPsi(bwt, k); + } + /* without setting bwt->sa[0] = -1, the following line should be + changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */ + return sa + bwt->sa[k/bwt->sa_intv]; +} + +static inline int __occ_aux(uint64_t y, int c) +{ + // reduce nucleotide counting to bits counting + y = ((c&2)? y : ~y) >> 1 & ((c&1)? y : ~y) & 0x5555555555555555ull; + // count the number of 1s in y + y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull); + return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; +} + +inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) +{ + bwtint_t n, l, j; + uint32_t *p; + + if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; + if (k == (bwtint_t)(-1)) return 0; + if (k >= bwt->primary) --k; // because $ is not in bwt + + // retrieve Occ at k/OCC_INTERVAL + n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; + p += sizeof(bwtint_t); // jump to the start of the first BWT cell + + // calculate Occ up to the last k/32 + j = k >> 5 << 5; + for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2) + n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + + // calculate Occ + n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); + if (c == 0) n -= ~k&31; // corrected for the masked bits + + return n; +} + +// an analogy to bwt_occ() but more efficient, requiring k <= l +inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol) +{ + bwtint_t _k, _l; + _k = (k >= bwt->primary)? k-1 : k; + _l = (l >= bwt->primary)? l-1 : l; + if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { + *ok = bwt_occ(bwt, k, c); + *ol = bwt_occ(bwt, l, c); + } else { + bwtint_t m, n, i, j; + uint32_t *p; + if (k >= bwt->primary) --k; + if (l >= bwt->primary) --l; + n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; + p += sizeof(bwtint_t); + // calculate *ok + j = k >> 5 << 5; + for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2) + n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + m = n; + n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); + if (c == 0) n -= ~k&31; // corrected for the masked bits + *ok = n; + // calculate *ol + j = l >> 5 << 5; + for (; i < j; i += 32, p += 2) + m += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c); + if (c == 0) m -= ~l&31; // corrected for the masked bits + *ol = m; + } +} + +#define __occ_aux4(bwt, b) \ + ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \ + + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24]) + +inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) +{ + bwtint_t l, j, x; + uint32_t *p; + if (k == (bwtint_t)(-1)) { + memset(cnt, 0, 4 * sizeof(bwtint_t)); + return; + } + if (k >= bwt->primary) --k; // because $ is not in bwt + p = bwt_occ_intv(bwt, k); + memcpy(cnt, p, 4 * sizeof(bwtint_t)); + p += sizeof(bwtint_t); + j = k >> 4 << 4; + for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p) + x += __occ_aux4(bwt, *p); + x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; +} + +// an analogy to bwt_occ4() but more efficient, requiring k <= l +inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) +{ + bwtint_t _k, _l; + _k = (k >= bwt->primary)? k-1 : k; + _l = (l >= bwt->primary)? l-1 : l; + if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { + bwt_occ4(bwt, k, cntk); + bwt_occ4(bwt, l, cntl); + } else { + bwtint_t i, j, x, y; + uint32_t *p; + if (k >= bwt->primary) --k; // because $ is not in bwt + if (l >= bwt->primary) --l; + p = bwt_occ_intv(bwt, k); + memcpy(cntk, p, 4 * sizeof(bwtint_t)); + p += sizeof(bwtint_t); + // prepare cntk[] + j = k >> 4 << 4; + for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p) + x += __occ_aux4(bwt, *p); + y = x; + x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + // calculate cntl[] and finalize cntk[] + j = l >> 4 << 4; + for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p); + y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15); + memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); + cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24; + cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; + } +} + +int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end) +{ + bwtint_t k, l, ok, ol; + int i; + k = 0; l = bwt->seq_len; + for (i = len - 1; i >= 0; --i) { + ubyte_t c = str[i]; + if (c > 3) return 0; // no match + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; + if (k > l) break; // no match + } + if (k > l) return 0; // no match + if (sa_begin) *sa_begin = k; + if (sa_end) *sa_end = l; + return l - k + 1; +} + +int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0) +{ + int i; + bwtint_t k, l, ok, ol; + k = *k0; l = *l0; + for (i = len - 1; i >= 0; --i) { + ubyte_t c = str[i]; + if (c > 3) return 0; // there is an N here. no match + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; + if (k > l) return 0; // no match + } + *k0 = k; *l0 = l; + return l - k + 1; +} + +/********************* + * Bidirectional BWT * + *********************/ + +void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) +{ + bwtint_t tk[4], tl[4]; + int i; + bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); + for (i = 0; i != 4; ++i) { + ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; + ok[i].x[2] = tl[i] - tk[i]; + } + ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); + ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; + ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; + ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; +} + +static void bwt_reverse_intvs(bwtintv_v *p) +{ + if (p->n > 1) { + int j; + for (j = 0; j < p->n>>1; ++j) { + bwtintv_t tmp = p->a[p->n - 1 - j]; + p->a[p->n - 1 - j] = p->a[j]; + p->a[j] = tmp; + } + } +} + +int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +{ + int i, j, c, ret; + bwtintv_t ik, ok[4]; + bwtintv_v a[2], *prev, *curr, *swap; + + mem->n = 0; + if (q[x] > 3) return x + 1; + kv_init(a[0]); kv_init(a[1]); + prev = tmpvec[0]? tmpvec[0] : &a[0]; + curr = tmpvec[1]? tmpvec[1] : &a[1]; + bwt_set_intv(bwt, q[x], ik); + ik.info = x + 1; + + for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search + if (q[i] < 4) { + c = 3 - q[i]; + bwt_extend(bwt, &ik, ok, 0); + if (ok[c].x[2] != ik.x[2]) // change of the interval size + kv_push(bwtintv_t, *curr, ik); + if (ok[c].x[2] == 0) break; // cannot be extended + ik = ok[c]; ik.info = i + 1; + } else { // an ambiguous base + kv_push(bwtintv_t, *curr, ik); + break; // cannot be extended; in this case, i<len always stands + } + } + if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end + bwt_reverse_intvs(curr); // s.t. smaller intervals visited first + ret = curr->a[0].info; // this will be the returned value + swap = curr; curr = prev; prev = swap; + + for (i = x - 1; i >= -1; --i) { // backward search for MEMs + if (q[i] > 3) break; + c = i < 0? 0 : q[i]; + for (j = 0, curr->n = 0; j < prev->n; ++j) { + bwtintv_t *p = &prev->a[j]; + bwt_extend(bwt, p, ok, 1); + if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further + if (curr->n == 0) { // curr->n to make sure there is no longer matches + if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches + ik = *p; ik.info |= (uint64_t)(i + 1)<<32; + kv_push(bwtintv_t, *mem, ik); + } + } // otherwise the match is contained in another longer match + } + if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) { + ok[c].info = p->info; + kv_push(bwtintv_t, *curr, ok[c]); + } + } + if (curr->n == 0) break; + swap = curr; curr = prev; prev = swap; + } + bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate + + if (tmpvec[0] == 0) free(a[0].a); + if (tmpvec[1] == 0) free(a[1].a); + return ret; +}