Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bwt.c @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 /* The MIT License | |
2 | |
3 Copyright (c) 2008 Genome Research Ltd (GRL). | |
4 | |
5 Permission is hereby granted, free of charge, to any person obtaining | |
6 a copy of this software and associated documentation files (the | |
7 "Software"), to deal in the Software without restriction, including | |
8 without limitation the rights to use, copy, modify, merge, publish, | |
9 distribute, sublicense, and/or sell copies of the Software, and to | |
10 permit persons to whom the Software is furnished to do so, subject to | |
11 the following conditions: | |
12 | |
13 The above copyright notice and this permission notice shall be | |
14 included in all copies or substantial portions of the Software. | |
15 | |
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
23 SOFTWARE. | |
24 */ | |
25 | |
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */ | |
27 | |
28 #include <stdlib.h> | |
29 #include <stdio.h> | |
30 #include <string.h> | |
31 #include <assert.h> | |
32 #include <stdint.h> | |
33 #include "utils.h" | |
34 #include "bwt.h" | |
35 | |
36 void bwt_gen_cnt_table(bwt_t *bwt) | |
37 { | |
38 int i, j; | |
39 for (i = 0; i != 256; ++i) { | |
40 uint32_t x = 0; | |
41 for (j = 0; j != 4; ++j) | |
42 x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); | |
43 bwt->cnt_table[i] = x; | |
44 } | |
45 } | |
46 | |
47 // bwt->bwt and bwt->occ must be precalculated | |
48 void bwt_cal_sa(bwt_t *bwt, int intv) | |
49 { | |
50 bwtint_t isa, sa, i; // S(isa) = sa | |
51 | |
52 xassert(bwt->bwt, "bwt_t::bwt is not initialized."); | |
53 | |
54 if (bwt->sa) free(bwt->sa); | |
55 bwt->sa_intv = intv; | |
56 bwt->n_sa = (bwt->seq_len + intv) / intv; | |
57 bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); | |
58 // calculate SA value | |
59 isa = 0; sa = bwt->seq_len; | |
60 for (i = 0; i < bwt->seq_len; ++i) { | |
61 if (isa % intv == 0) bwt->sa[isa/intv] = sa; | |
62 --sa; | |
63 isa = bwt_invPsi(bwt, isa); | |
64 } | |
65 if (isa % intv == 0) bwt->sa[isa/intv] = sa; | |
66 bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len | |
67 } | |
68 | |
69 bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k) | |
70 { | |
71 bwtint_t sa = 0; | |
72 while (k % bwt->sa_intv != 0) { | |
73 ++sa; | |
74 k = bwt_invPsi(bwt, k); | |
75 } | |
76 /* without setting bwt->sa[0] = -1, the following line should be | |
77 changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */ | |
78 return sa + bwt->sa[k/bwt->sa_intv]; | |
79 } | |
80 | |
81 static inline int __occ_aux(uint64_t y, int c) | |
82 { | |
83 // reduce nucleotide counting to bits counting | |
84 y = ((c&2)? y : ~y) >> 1 & ((c&1)? y : ~y) & 0x5555555555555555ull; | |
85 // count the number of 1s in y | |
86 y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull); | |
87 return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; | |
88 } | |
89 | |
90 inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) | |
91 { | |
92 bwtint_t n, l, j; | |
93 uint32_t *p; | |
94 | |
95 if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; | |
96 if (k == (bwtint_t)(-1)) return 0; | |
97 if (k >= bwt->primary) --k; // because $ is not in bwt | |
98 | |
99 // retrieve Occ at k/OCC_INTERVAL | |
100 n = (p = bwt_occ_intv(bwt, k))[c]; | |
101 p += 4; // jump to the start of the first BWT cell | |
102 | |
103 // calculate Occ up to the last k/32 | |
104 j = k >> 5 << 5; | |
105 for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2) | |
106 n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); | |
107 | |
108 // calculate Occ | |
109 n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); | |
110 if (c == 0) n -= ~k&31; // corrected for the masked bits | |
111 | |
112 return n; | |
113 } | |
114 | |
115 // an analogy to bwt_occ() but more efficient, requiring k <= l | |
116 inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol) | |
117 { | |
118 bwtint_t _k, _l; | |
119 if (k == l) { | |
120 *ok = *ol = bwt_occ(bwt, k, c); | |
121 return; | |
122 } | |
123 _k = (k >= bwt->primary)? k-1 : k; | |
124 _l = (l >= bwt->primary)? l-1 : l; | |
125 if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { | |
126 *ok = bwt_occ(bwt, k, c); | |
127 *ol = bwt_occ(bwt, l, c); | |
128 } else { | |
129 bwtint_t m, n, i, j; | |
130 uint32_t *p; | |
131 if (k >= bwt->primary) --k; | |
132 if (l >= bwt->primary) --l; | |
133 n = (p = bwt_occ_intv(bwt, k))[c]; | |
134 p += 4; | |
135 // calculate *ok | |
136 j = k >> 5 << 5; | |
137 for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2) | |
138 n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); | |
139 m = n; | |
140 n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); | |
141 if (c == 0) n -= ~k&31; // corrected for the masked bits | |
142 *ok = n; | |
143 // calculate *ol | |
144 j = l >> 5 << 5; | |
145 for (; i < j; i += 32, p += 2) | |
146 m += __occ_aux((uint64_t)p[0]<<32 | p[1], c); | |
147 m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c); | |
148 if (c == 0) m -= ~l&31; // corrected for the masked bits | |
149 *ol = m; | |
150 } | |
151 } | |
152 | |
153 #define __occ_aux4(bwt, b) \ | |
154 ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \ | |
155 + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24]) | |
156 | |
157 inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) | |
158 { | |
159 bwtint_t l, j, x; | |
160 uint32_t *p; | |
161 if (k == (bwtint_t)(-1)) { | |
162 memset(cnt, 0, 4 * sizeof(bwtint_t)); | |
163 return; | |
164 } | |
165 if (k >= bwt->primary) --k; // because $ is not in bwt | |
166 p = bwt_occ_intv(bwt, k); | |
167 memcpy(cnt, p, 16); | |
168 p += 4; | |
169 j = k >> 4 << 4; | |
170 for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p) | |
171 x += __occ_aux4(bwt, *p); | |
172 x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); | |
173 cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; | |
174 } | |
175 | |
176 // an analogy to bwt_occ4() but more efficient, requiring k <= l | |
177 inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) | |
178 { | |
179 bwtint_t _k, _l; | |
180 if (k == l) { | |
181 bwt_occ4(bwt, k, cntk); | |
182 memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); | |
183 return; | |
184 } | |
185 _k = (k >= bwt->primary)? k-1 : k; | |
186 _l = (l >= bwt->primary)? l-1 : l; | |
187 if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { | |
188 bwt_occ4(bwt, k, cntk); | |
189 bwt_occ4(bwt, l, cntl); | |
190 } else { | |
191 bwtint_t i, j, x, y; | |
192 uint32_t *p; | |
193 int cl[4]; | |
194 if (k >= bwt->primary) --k; // because $ is not in bwt | |
195 if (l >= bwt->primary) --l; | |
196 cl[0] = cl[1] = cl[2] = cl[3] = 0; | |
197 p = bwt_occ_intv(bwt, k); | |
198 memcpy(cntk, p, 4 * sizeof(bwtint_t)); | |
199 p += 4; | |
200 // prepare cntk[] | |
201 j = k >> 4 << 4; | |
202 for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p) | |
203 x += __occ_aux4(bwt, *p); | |
204 y = x; | |
205 x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); | |
206 // calculate cntl[] and finalize cntk[] | |
207 j = l >> 4 << 4; | |
208 for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p); | |
209 y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15); | |
210 memcpy(cntl, cntk, 16); | |
211 cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24; | |
212 cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; | |
213 } | |
214 } | |
215 | |
216 int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end) | |
217 { | |
218 bwtint_t k, l, ok, ol; | |
219 int i; | |
220 k = 0; l = bwt->seq_len; | |
221 for (i = len - 1; i >= 0; --i) { | |
222 ubyte_t c = str[i]; | |
223 if (c > 3) return 0; // no match | |
224 bwt_2occ(bwt, k - 1, l, c, &ok, &ol); | |
225 k = bwt->L2[c] + ok + 1; | |
226 l = bwt->L2[c] + ol; | |
227 if (k > l) break; // no match | |
228 } | |
229 if (k > l) return 0; // no match | |
230 if (sa_begin) *sa_begin = k; | |
231 if (sa_end) *sa_end = l; | |
232 return l - k + 1; | |
233 } | |
234 | |
235 int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0) | |
236 { | |
237 int i; | |
238 bwtint_t k, l, ok, ol; | |
239 k = *k0; l = *l0; | |
240 for (i = len - 1; i >= 0; --i) { | |
241 ubyte_t c = str[i]; | |
242 if (c > 3) return 0; // there is an N here. no match | |
243 bwt_2occ(bwt, k - 1, l, c, &ok, &ol); | |
244 k = bwt->L2[c] + ok + 1; | |
245 l = bwt->L2[c] + ol; | |
246 if (k > l) return 0; // no match | |
247 } | |
248 *k0 = k; *l0 = l; | |
249 return l - k + 1; | |
250 } |