annotate bwa-0.6.2/bwt.c @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
1 /* The MIT License
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
2
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
3 Copyright (c) 2008 Genome Research Ltd (GRL).
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
4
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
5 Permission is hereby granted, free of charge, to any person obtaining
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
6 a copy of this software and associated documentation files (the
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
7 "Software"), to deal in the Software without restriction, including
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
8 without limitation the rights to use, copy, modify, merge, publish,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
9 distribute, sublicense, and/or sell copies of the Software, and to
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
10 permit persons to whom the Software is furnished to do so, subject to
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
11 the following conditions:
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
12
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
13 The above copyright notice and this permission notice shall be
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
14 included in all copies or substantial portions of the Software.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
15
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
23 SOFTWARE.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
24 */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
25
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
27
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
28 #include <stdlib.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
29 #include <stdio.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
30 #include <string.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
31 #include <assert.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
32 #include <stdint.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
33 #include "utils.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
34 #include "bwt.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
35 #include "kvec.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
36
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
37 void bwt_gen_cnt_table(bwt_t *bwt)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
38 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
39 int i, j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
40 for (i = 0; i != 256; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
41 uint32_t x = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
42 for (j = 0; j != 4; ++j)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
43 x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
44 bwt->cnt_table[i] = x;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
45 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
46 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
47
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
48 // bwt->bwt and bwt->occ must be precalculated
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
49 void bwt_cal_sa(bwt_t *bwt, int intv)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
50 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
51 bwtint_t isa, sa, i; // S(isa) = sa
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
52 int intv_round = intv;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
53
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
54 kv_roundup32(intv_round);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
55 xassert(intv_round == intv, "SA sample interval is not a power of 2.");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
56 xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
57
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
58 if (bwt->sa) free(bwt->sa);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
59 bwt->sa_intv = intv;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
60 bwt->n_sa = (bwt->seq_len + intv) / intv;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
61 bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
62 if (bwt->sa == 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
63 fprintf(stderr, "[%s] Fail to allocate %.3fMB memory. Abort!\n", __func__, bwt->n_sa * sizeof(bwtint_t) / 1024.0/1024.0);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
64 abort();
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
65 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
66 // calculate SA value
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
67 isa = 0; sa = bwt->seq_len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
68 for (i = 0; i < bwt->seq_len; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
69 if (isa % intv == 0) bwt->sa[isa/intv] = sa;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
70 --sa;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
71 isa = bwt_invPsi(bwt, isa);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
72 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
73 if (isa % intv == 0) bwt->sa[isa/intv] = sa;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
74 bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
75 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
76
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
77 bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
78 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
79 bwtint_t sa = 0, mask = bwt->sa_intv - 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
80 while (k & mask) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
81 ++sa;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
82 k = bwt_invPsi(bwt, k);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
83 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
84 /* without setting bwt->sa[0] = -1, the following line should be
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
85 changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
86 return sa + bwt->sa[k/bwt->sa_intv];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
87 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
88
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
89 static inline int __occ_aux(uint64_t y, int c)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
90 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
91 // reduce nucleotide counting to bits counting
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
92 y = ((c&2)? y : ~y) >> 1 & ((c&1)? y : ~y) & 0x5555555555555555ull;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
93 // count the number of 1s in y
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
94 y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
95 return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
96 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
97
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
98 inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
99 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
100 bwtint_t n, l, j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
101 uint32_t *p;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
102
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
103 if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
104 if (k == (bwtint_t)(-1)) return 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
105 if (k >= bwt->primary) --k; // because $ is not in bwt
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
106
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
107 // retrieve Occ at k/OCC_INTERVAL
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
108 n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
109 p += sizeof(bwtint_t); // jump to the start of the first BWT cell
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
110
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
111 // calculate Occ up to the last k/32
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
112 j = k >> 5 << 5;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
113 for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
114 n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
115
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
116 // calculate Occ
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
117 n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
118 if (c == 0) n -= ~k&31; // corrected for the masked bits
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
119
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
120 return n;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
121 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
122
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
123 // an analogy to bwt_occ() but more efficient, requiring k <= l
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
124 inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
125 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
126 bwtint_t _k, _l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
127 _k = (k >= bwt->primary)? k-1 : k;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
128 _l = (l >= bwt->primary)? l-1 : l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
129 if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
130 *ok = bwt_occ(bwt, k, c);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
131 *ol = bwt_occ(bwt, l, c);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
132 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
133 bwtint_t m, n, i, j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
134 uint32_t *p;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
135 if (k >= bwt->primary) --k;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
136 if (l >= bwt->primary) --l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
137 n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
138 p += sizeof(bwtint_t);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
139 // calculate *ok
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
140 j = k >> 5 << 5;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
141 for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
142 n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
143 m = n;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
144 n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
145 if (c == 0) n -= ~k&31; // corrected for the masked bits
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
146 *ok = n;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
147 // calculate *ol
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
148 j = l >> 5 << 5;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
149 for (; i < j; i += 32, p += 2)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
150 m += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
151 m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
152 if (c == 0) m -= ~l&31; // corrected for the masked bits
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
153 *ol = m;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
154 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
155 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
156
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
157 #define __occ_aux4(bwt, b) \
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
158 ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
159 + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24])
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
160
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
161 inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
162 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
163 bwtint_t l, j, x;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
164 uint32_t *p;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
165 if (k == (bwtint_t)(-1)) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
166 memset(cnt, 0, 4 * sizeof(bwtint_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
167 return;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
168 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
169 if (k >= bwt->primary) --k; // because $ is not in bwt
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
170 p = bwt_occ_intv(bwt, k);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
171 memcpy(cnt, p, 4 * sizeof(bwtint_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
172 p += sizeof(bwtint_t);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
173 j = k >> 4 << 4;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
174 for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
175 x += __occ_aux4(bwt, *p);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
176 x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
177 cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
178 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
179
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
180 // an analogy to bwt_occ4() but more efficient, requiring k <= l
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
181 inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
182 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
183 bwtint_t _k, _l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
184 _k = (k >= bwt->primary)? k-1 : k;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
185 _l = (l >= bwt->primary)? l-1 : l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
186 if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
187 bwt_occ4(bwt, k, cntk);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
188 bwt_occ4(bwt, l, cntl);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
189 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
190 bwtint_t i, j, x, y;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
191 uint32_t *p;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
192 if (k >= bwt->primary) --k; // because $ is not in bwt
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
193 if (l >= bwt->primary) --l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
194 p = bwt_occ_intv(bwt, k);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
195 memcpy(cntk, p, 4 * sizeof(bwtint_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
196 p += sizeof(bwtint_t);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
197 // prepare cntk[]
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
198 j = k >> 4 << 4;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
199 for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
200 x += __occ_aux4(bwt, *p);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
201 y = x;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
202 x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
203 // calculate cntl[] and finalize cntk[]
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
204 j = l >> 4 << 4;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
205 for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
206 y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
207 memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
208 cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
209 cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
210 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
211 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
212
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
213 int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
214 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
215 bwtint_t k, l, ok, ol;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
216 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
217 k = 0; l = bwt->seq_len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
218 for (i = len - 1; i >= 0; --i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
219 ubyte_t c = str[i];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
220 if (c > 3) return 0; // no match
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
221 bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
222 k = bwt->L2[c] + ok + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
223 l = bwt->L2[c] + ol;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
224 if (k > l) break; // no match
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
225 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
226 if (k > l) return 0; // no match
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
227 if (sa_begin) *sa_begin = k;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
228 if (sa_end) *sa_end = l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
229 return l - k + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
230 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
231
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
232 int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
233 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
234 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
235 bwtint_t k, l, ok, ol;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
236 k = *k0; l = *l0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
237 for (i = len - 1; i >= 0; --i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
238 ubyte_t c = str[i];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
239 if (c > 3) return 0; // there is an N here. no match
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
240 bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
241 k = bwt->L2[c] + ok + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
242 l = bwt->L2[c] + ol;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
243 if (k > l) return 0; // no match
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
244 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
245 *k0 = k; *l0 = l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
246 return l - k + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
247 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
248
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
249 /*********************
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
250 * Bidirectional BWT *
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
251 *********************/
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
252
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
253 void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
254 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
255 bwtint_t tk[4], tl[4];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
256 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
257 bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
258 for (i = 0; i != 4; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
259 ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
260 ok[i].x[2] = tl[i] - tk[i];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
261 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
262 ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
263 ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
264 ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
265 ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
266 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
267
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
268 static void bwt_reverse_intvs(bwtintv_v *p)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
269 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
270 if (p->n > 1) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
271 int j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
272 for (j = 0; j < p->n>>1; ++j) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
273 bwtintv_t tmp = p->a[p->n - 1 - j];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
274 p->a[p->n - 1 - j] = p->a[j];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
275 p->a[j] = tmp;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
276 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
277 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
278 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
279
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
280 int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2])
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
281 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
282 int i, j, c, ret;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
283 bwtintv_t ik, ok[4];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
284 bwtintv_v a[2], *prev, *curr, *swap;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
285
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
286 mem->n = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
287 if (q[x] > 3) return x + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
288 kv_init(a[0]); kv_init(a[1]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
289 prev = tmpvec[0]? tmpvec[0] : &a[0];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
290 curr = tmpvec[1]? tmpvec[1] : &a[1];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
291 bwt_set_intv(bwt, q[x], ik);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
292 ik.info = x + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
293
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
294 for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
295 if (q[i] < 4) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
296 c = 3 - q[i];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
297 bwt_extend(bwt, &ik, ok, 0);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
298 if (ok[c].x[2] != ik.x[2]) // change of the interval size
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
299 kv_push(bwtintv_t, *curr, ik);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
300 if (ok[c].x[2] == 0) break; // cannot be extended
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
301 ik = ok[c]; ik.info = i + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
302 } else { // an ambiguous base
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
303 kv_push(bwtintv_t, *curr, ik);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
304 break; // cannot be extended; in this case, i<len always stands
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
305 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
306 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
307 if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
308 bwt_reverse_intvs(curr); // s.t. smaller intervals visited first
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
309 ret = curr->a[0].info; // this will be the returned value
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
310 swap = curr; curr = prev; prev = swap;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
311
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
312 for (i = x - 1; i >= -1; --i) { // backward search for MEMs
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
313 if (q[i] > 3) break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
314 c = i < 0? 0 : q[i];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
315 for (j = 0, curr->n = 0; j < prev->n; ++j) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
316 bwtintv_t *p = &prev->a[j];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
317 bwt_extend(bwt, p, ok, 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
318 if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
319 if (curr->n == 0) { // curr->n to make sure there is no longer matches
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
320 if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
321 ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
322 kv_push(bwtintv_t, *mem, ik);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
323 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
324 } // otherwise the match is contained in another longer match
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
325 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
326 if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
327 ok[c].info = p->info;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
328 kv_push(bwtintv_t, *curr, ok[c]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
329 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
330 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
331 if (curr->n == 0) break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
332 swap = curr; curr = prev; prev = swap;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
333 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
334 bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
335
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
336 if (tmpvec[0] == 0) free(a[0].a);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
337 if (tmpvec[1] == 0) free(a[1].a);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
338 return ret;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
339 }