Mercurial > repos > ashvark > qiime_1_8_0
comparison bwa-0.6.2/ksw.c @ 2:a294fbfcb1db draft default tip
Uploaded BWA
author | ashvark |
---|---|
date | Fri, 18 Jul 2014 07:55:59 -0400 |
parents | dd1186b11b3b |
children |
comparison
equal
deleted
inserted
replaced
1:a9636dc1e99a | 2:a294fbfcb1db |
---|---|
1 /* The MIT License | |
2 | |
3 Copyright (c) 2011 by Attractive Chaos <attractor@live.co.uk> | |
4 | |
5 Permission is hereby granted, free of charge, to any person obtaining | |
6 a copy of this software and associated documentation files (the | |
7 "Software"), to deal in the Software without restriction, including | |
8 without limitation the rights to use, copy, modify, merge, publish, | |
9 distribute, sublicense, and/or sell copies of the Software, and to | |
10 permit persons to whom the Software is furnished to do so, subject to | |
11 the following conditions: | |
12 | |
13 The above copyright notice and this permission notice shall be | |
14 included in all copies or substantial portions of the Software. | |
15 | |
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
23 SOFTWARE. | |
24 */ | |
25 | |
26 #ifndef _NO_SSE2 | |
27 #include <stdlib.h> | |
28 #include <stdint.h> | |
29 #include <emmintrin.h> | |
30 #include "ksw.h" | |
31 | |
32 #ifdef __GNUC__ | |
33 #define LIKELY(x) __builtin_expect((x),1) | |
34 #define UNLIKELY(x) __builtin_expect((x),0) | |
35 #else | |
36 #define LIKELY(x) (x) | |
37 #define UNLIKELY(x) (x) | |
38 #endif | |
39 | |
40 struct _ksw_query_t { | |
41 int qlen, slen; | |
42 uint8_t shift, mdiff, max, size; | |
43 __m128i *qp, *H0, *H1, *E, *Hmax; | |
44 }; | |
45 | |
46 ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) | |
47 { | |
48 ksw_query_t *q; | |
49 int slen, a, tmp, p; | |
50 | |
51 size = size > 1? 2 : 1; | |
52 p = 8 * (3 - size); // # values per __m128i | |
53 slen = (qlen + p - 1) / p; // segmented length | |
54 q = malloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory | |
55 q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory | |
56 q->H0 = q->qp + slen * m; | |
57 q->H1 = q->H0 + slen; | |
58 q->E = q->H1 + slen; | |
59 q->Hmax = q->E + slen; | |
60 q->slen = slen; q->qlen = qlen; q->size = size; | |
61 // compute shift | |
62 tmp = m * m; | |
63 for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score | |
64 if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; | |
65 if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; | |
66 } | |
67 q->max = q->mdiff; | |
68 q->shift = 256 - q->shift; // NB: q->shift is uint8_t | |
69 q->mdiff += q->shift; // this is the difference between the min and max scores | |
70 // An example: p=8, qlen=19, slen=3 and segmentation: | |
71 // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} | |
72 if (size == 1) { | |
73 int8_t *t = (int8_t*)q->qp; | |
74 for (a = 0; a < m; ++a) { | |
75 int i, k, nlen = slen * p; | |
76 const int8_t *ma = mat + a * m; | |
77 for (i = 0; i < slen; ++i) | |
78 for (k = i; k < nlen; k += slen) // p iterations | |
79 *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; | |
80 } | |
81 } else { | |
82 int16_t *t = (int16_t*)q->qp; | |
83 for (a = 0; a < m; ++a) { | |
84 int i, k, nlen = slen * p; | |
85 const int8_t *ma = mat + a * m; | |
86 for (i = 0; i < slen; ++i) | |
87 for (k = i; k < nlen; k += slen) // p iterations | |
88 *t++ = (k >= qlen? 0 : ma[query[k]]); | |
89 } | |
90 } | |
91 return q; | |
92 } | |
93 | |
94 int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) | |
95 { | |
96 int slen, i, m_b, n_b, te = -1, gmax = 0; | |
97 uint64_t *b; | |
98 __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; | |
99 | |
100 #define __max_16(ret, xx) do { \ | |
101 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ | |
102 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ | |
103 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ | |
104 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ | |
105 (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ | |
106 } while (0) | |
107 | |
108 // initialization | |
109 m_b = n_b = 0; b = 0; | |
110 zero = _mm_set1_epi32(0); | |
111 gapoe = _mm_set1_epi8(a->gapo + a->gape); | |
112 gape = _mm_set1_epi8(a->gape); | |
113 shift = _mm_set1_epi8(q->shift); | |
114 H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; | |
115 slen = q->slen; | |
116 for (i = 0; i < slen; ++i) { | |
117 _mm_store_si128(E + i, zero); | |
118 _mm_store_si128(H0 + i, zero); | |
119 _mm_store_si128(Hmax + i, zero); | |
120 } | |
121 // the core loop | |
122 for (i = 0; i < tlen; ++i) { | |
123 int j, k, cmp, imax; | |
124 __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector | |
125 h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example | |
126 h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian | |
127 for (j = 0; LIKELY(j < slen); ++j) { | |
128 /* SW cells are computed in the following order: | |
129 * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} | |
130 * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} | |
131 * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} | |
132 */ | |
133 // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) | |
134 h = _mm_adds_epu8(h, _mm_load_si128(S + j)); | |
135 h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) | |
136 e = _mm_load_si128(E + j); // e=E'(i,j) | |
137 h = _mm_max_epu8(h, e); | |
138 h = _mm_max_epu8(h, f); // h=H'(i,j) | |
139 max = _mm_max_epu8(max, h); // set max | |
140 _mm_store_si128(H1 + j, h); // save to H'(i,j) | |
141 // now compute E'(i+1,j) | |
142 h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo | |
143 e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape | |
144 e = _mm_max_epu8(e, h); // e=E'(i+1,j) | |
145 _mm_store_si128(E + j, e); // save to E'(i+1,j) | |
146 // now compute F'(i,j+1) | |
147 f = _mm_subs_epu8(f, gape); | |
148 f = _mm_max_epu8(f, h); | |
149 // get H'(i-1,j) and prepare for the next j | |
150 h = _mm_load_si128(H0 + j); // h=H'(i-1,j) | |
151 } | |
152 // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion | |
153 for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max | |
154 f = _mm_slli_si128(f, 1); | |
155 for (j = 0; LIKELY(j < slen); ++j) { | |
156 h = _mm_load_si128(H1 + j); | |
157 h = _mm_max_epu8(h, f); // h=H'(i,j) | |
158 _mm_store_si128(H1 + j, h); | |
159 h = _mm_subs_epu8(h, gapoe); | |
160 f = _mm_subs_epu8(f, gape); | |
161 cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); | |
162 if (UNLIKELY(cmp == 0xffff)) goto end_loop16; | |
163 } | |
164 } | |
165 end_loop16: | |
166 //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); | |
167 __max_16(imax, max); // imax is the maximum number in max | |
168 if (imax >= a->T) { // write the b array; this condition adds branching unfornately | |
169 if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append | |
170 if (n_b == m_b) { | |
171 m_b = m_b? m_b<<1 : 8; | |
172 b = realloc(b, 8 * m_b); | |
173 } | |
174 b[n_b++] = (uint64_t)imax<<32 | i; | |
175 } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last | |
176 } | |
177 if (imax > gmax) { | |
178 gmax = imax; te = i; // te is the end position on the target | |
179 for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector | |
180 _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); | |
181 if (gmax + q->shift >= 255) break; | |
182 } | |
183 S = H1; H1 = H0; H0 = S; // swap H0 and H1 | |
184 } | |
185 a->score = gmax; a->te = te; | |
186 { // get a->qe, the end of query match; find the 2nd best score | |
187 int max = -1, low, high, qlen = slen * 16; | |
188 uint8_t *t = (uint8_t*)Hmax; | |
189 for (i = 0, a->qe = -1; i < qlen; ++i, ++t) | |
190 if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen; | |
191 //printf("%d,%d\n", max, gmax); | |
192 i = (a->score + q->max - 1) / q->max; | |
193 low = te - i; high = te + i; | |
194 for (i = 0, a->score2 = 0; i < n_b; ++i) { | |
195 int e = (int32_t)b[i]; | |
196 if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) | |
197 a->score2 = b[i]>>32, a->te2 = e; | |
198 } | |
199 } | |
200 free(b); | |
201 return a->score + q->shift >= 255? 255 : a->score; | |
202 } | |
203 | |
204 int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) | |
205 { | |
206 int slen, i, m_b, n_b, te = -1, gmax = 0; | |
207 uint64_t *b; | |
208 __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; | |
209 | |
210 #define __max_8(ret, xx) do { \ | |
211 (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ | |
212 (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ | |
213 (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ | |
214 (ret) = _mm_extract_epi16((xx), 0); \ | |
215 } while (0) | |
216 | |
217 // initialization | |
218 m_b = n_b = 0; b = 0; | |
219 zero = _mm_set1_epi32(0); | |
220 gapoe = _mm_set1_epi16(a->gapo + a->gape); | |
221 gape = _mm_set1_epi16(a->gape); | |
222 H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; | |
223 slen = q->slen; | |
224 for (i = 0; i < slen; ++i) { | |
225 _mm_store_si128(E + i, zero); | |
226 _mm_store_si128(H0 + i, zero); | |
227 _mm_store_si128(Hmax + i, zero); | |
228 } | |
229 // the core loop | |
230 for (i = 0; i < tlen; ++i) { | |
231 int j, k, imax; | |
232 __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector | |
233 h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example | |
234 h = _mm_slli_si128(h, 2); | |
235 for (j = 0; LIKELY(j < slen); ++j) { | |
236 h = _mm_adds_epi16(h, *S++); | |
237 e = _mm_load_si128(E + j); | |
238 h = _mm_max_epi16(h, e); | |
239 h = _mm_max_epi16(h, f); | |
240 max = _mm_max_epi16(max, h); | |
241 _mm_store_si128(H1 + j, h); | |
242 h = _mm_subs_epu16(h, gapoe); | |
243 e = _mm_subs_epu16(e, gape); | |
244 e = _mm_max_epi16(e, h); | |
245 _mm_store_si128(E + j, e); | |
246 f = _mm_subs_epu16(f, gape); | |
247 f = _mm_max_epi16(f, h); | |
248 h = _mm_load_si128(H0 + j); | |
249 } | |
250 for (k = 0; LIKELY(k < 16); ++k) { | |
251 f = _mm_slli_si128(f, 2); | |
252 for (j = 0; LIKELY(j < slen); ++j) { | |
253 h = _mm_load_si128(H1 + j); | |
254 h = _mm_max_epi16(h, f); | |
255 _mm_store_si128(H1 + j, h); | |
256 h = _mm_subs_epu16(h, gapoe); | |
257 f = _mm_subs_epu16(f, gape); | |
258 if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; | |
259 } | |
260 } | |
261 end_loop8: | |
262 __max_8(imax, max); | |
263 if (imax >= a->T) { | |
264 if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { | |
265 if (n_b == m_b) { | |
266 m_b = m_b? m_b<<1 : 8; | |
267 b = realloc(b, 8 * m_b); | |
268 } | |
269 b[n_b++] = (uint64_t)imax<<32 | i; | |
270 } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last | |
271 } | |
272 if (imax > gmax) { | |
273 gmax = imax; te = i; | |
274 for (j = 0; LIKELY(j < slen); ++j) | |
275 _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); | |
276 } | |
277 S = H1; H1 = H0; H0 = S; | |
278 } | |
279 a->score = gmax; a->te = te; | |
280 { | |
281 int max = -1, low, high, qlen = slen * 8; | |
282 uint16_t *t = (uint16_t*)Hmax; | |
283 for (i = 0, a->qe = -1; i < qlen; ++i, ++t) | |
284 if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen; | |
285 i = (a->score + q->max - 1) / q->max; | |
286 low = te - i; high = te + i; | |
287 for (i = 0, a->score2 = 0; i < n_b; ++i) { | |
288 int e = (int32_t)b[i]; | |
289 if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) | |
290 a->score2 = b[i]>>32, a->te2 = e; | |
291 } | |
292 } | |
293 free(b); | |
294 return a->score; | |
295 } | |
296 | |
297 int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) | |
298 { | |
299 if (q->size == 1) return ksw_sse2_16(q, tlen, target, a); | |
300 else return ksw_sse2_8(q, tlen, target, a); | |
301 } | |
302 | |
303 /******************************************* | |
304 * Main function (not compiled by default) * | |
305 *******************************************/ | |
306 | |
307 #ifdef _KSW_MAIN | |
308 | |
309 #include <unistd.h> | |
310 #include <stdio.h> | |
311 #include <zlib.h> | |
312 #include "kseq.h" | |
313 KSEQ_INIT(gzFile, gzread) | |
314 | |
315 unsigned char seq_nt4_table[256] = { | |
316 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
317 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
318 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
319 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
320 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, | |
321 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
322 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, | |
323 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
324 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
325 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
326 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
327 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
328 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
329 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
330 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
331 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 | |
332 }; | |
333 | |
334 int main(int argc, char *argv[]) | |
335 { | |
336 int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2; | |
337 int8_t mat[25]; | |
338 ksw_aux_t a; | |
339 gzFile fpt, fpq; | |
340 kseq_t *kst, *ksq; | |
341 // parse command line | |
342 a.gapo = 5; a.gape = 2; a.T = 10; | |
343 while ((c = getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) { | |
344 switch (c) { | |
345 case 'a': sa = atoi(optarg); break; | |
346 case 'b': sb = atoi(optarg); break; | |
347 case 'q': a.gapo = atoi(optarg); break; | |
348 case 'r': a.gape = atoi(optarg); break; | |
349 case 't': a.T = atoi(optarg); break; | |
350 case 'f': forward_only = 1; break; | |
351 case 's': size = atoi(optarg); break; | |
352 } | |
353 } | |
354 if (optind + 2 > argc) { | |
355 fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] <target.fa> <query.fa>\n", size, sa, sb, a.gapo, a.gape); | |
356 return 1; | |
357 } | |
358 // initialize scoring matrix | |
359 for (i = k = 0; i < 5; ++i) { | |
360 for (j = 0; j < 4; ++j) | |
361 mat[k++] = i == j? sa : -sb; | |
362 mat[k++] = 0; // ambiguous base | |
363 } | |
364 for (j = 0; j < 5; ++j) mat[k++] = 0; | |
365 // open file | |
366 fpt = gzopen(argv[optind], "r"); kst = kseq_init(fpt); | |
367 fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); | |
368 // all-pair alignment | |
369 while (kseq_read(ksq) > 0) { | |
370 ksw_query_t *q[2]; | |
371 for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; | |
372 q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); | |
373 if (!forward_only) { // reverse | |
374 for (i = 0; i < ksq->seq.l/2; ++i) { | |
375 int t = ksq->seq.s[i]; | |
376 ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i]; | |
377 ksq->seq.s[ksq->seq.l-1-i] = t; | |
378 } | |
379 for (i = 0; i < ksq->seq.l; ++i) | |
380 ksq->seq.s[i] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i]; | |
381 q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); | |
382 } else q[1] = 0; | |
383 gzrewind(fpt); kseq_rewind(kst); | |
384 while (kseq_read(kst) > 0) { | |
385 int s; | |
386 for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; | |
387 s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a); | |
388 printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); | |
389 if (q[1]) { | |
390 s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a); | |
391 printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); | |
392 } | |
393 } | |
394 free(q[0]); free(q[1]); | |
395 } | |
396 kseq_destroy(kst); gzclose(fpt); | |
397 kseq_destroy(ksq); gzclose(fpq); | |
398 return 0; | |
399 } | |
400 #endif // _KSW_MAIN | |
401 #endif // _NO_SSE2 |