comparison bwa-0.6.2/ksw.c @ 0:dd1186b11b3b draft

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dd1186b11b3b
1 /* The MIT License
2
3 Copyright (c) 2011 by Attractive Chaos <attractor@live.co.uk>
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 #ifndef _NO_SSE2
27 #include <stdlib.h>
28 #include <stdint.h>
29 #include <emmintrin.h>
30 #include "ksw.h"
31
32 #ifdef __GNUC__
33 #define LIKELY(x) __builtin_expect((x),1)
34 #define UNLIKELY(x) __builtin_expect((x),0)
35 #else
36 #define LIKELY(x) (x)
37 #define UNLIKELY(x) (x)
38 #endif
39
40 struct _ksw_query_t {
41 int qlen, slen;
42 uint8_t shift, mdiff, max, size;
43 __m128i *qp, *H0, *H1, *E, *Hmax;
44 };
45
46 ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
47 {
48 ksw_query_t *q;
49 int slen, a, tmp, p;
50
51 size = size > 1? 2 : 1;
52 p = 8 * (3 - size); // # values per __m128i
53 slen = (qlen + p - 1) / p; // segmented length
54 q = malloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
55 q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory
56 q->H0 = q->qp + slen * m;
57 q->H1 = q->H0 + slen;
58 q->E = q->H1 + slen;
59 q->Hmax = q->E + slen;
60 q->slen = slen; q->qlen = qlen; q->size = size;
61 // compute shift
62 tmp = m * m;
63 for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
64 if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
65 if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
66 }
67 q->max = q->mdiff;
68 q->shift = 256 - q->shift; // NB: q->shift is uint8_t
69 q->mdiff += q->shift; // this is the difference between the min and max scores
70 // An example: p=8, qlen=19, slen=3 and segmentation:
71 // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
72 if (size == 1) {
73 int8_t *t = (int8_t*)q->qp;
74 for (a = 0; a < m; ++a) {
75 int i, k, nlen = slen * p;
76 const int8_t *ma = mat + a * m;
77 for (i = 0; i < slen; ++i)
78 for (k = i; k < nlen; k += slen) // p iterations
79 *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
80 }
81 } else {
82 int16_t *t = (int16_t*)q->qp;
83 for (a = 0; a < m; ++a) {
84 int i, k, nlen = slen * p;
85 const int8_t *ma = mat + a * m;
86 for (i = 0; i < slen; ++i)
87 for (k = i; k < nlen; k += slen) // p iterations
88 *t++ = (k >= qlen? 0 : ma[query[k]]);
89 }
90 }
91 return q;
92 }
93
94 int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e)
95 {
96 int slen, i, m_b, n_b, te = -1, gmax = 0;
97 uint64_t *b;
98 __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
99
100 #define __max_16(ret, xx) do { \
101 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
102 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
103 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
104 (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
105 (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
106 } while (0)
107
108 // initialization
109 m_b = n_b = 0; b = 0;
110 zero = _mm_set1_epi32(0);
111 gapoe = _mm_set1_epi8(a->gapo + a->gape);
112 gape = _mm_set1_epi8(a->gape);
113 shift = _mm_set1_epi8(q->shift);
114 H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
115 slen = q->slen;
116 for (i = 0; i < slen; ++i) {
117 _mm_store_si128(E + i, zero);
118 _mm_store_si128(H0 + i, zero);
119 _mm_store_si128(Hmax + i, zero);
120 }
121 // the core loop
122 for (i = 0; i < tlen; ++i) {
123 int j, k, cmp, imax;
124 __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
125 h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
126 h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
127 for (j = 0; LIKELY(j < slen); ++j) {
128 /* SW cells are computed in the following order:
129 * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
130 * E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
131 * F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
132 */
133 // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
134 h = _mm_adds_epu8(h, _mm_load_si128(S + j));
135 h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
136 e = _mm_load_si128(E + j); // e=E'(i,j)
137 h = _mm_max_epu8(h, e);
138 h = _mm_max_epu8(h, f); // h=H'(i,j)
139 max = _mm_max_epu8(max, h); // set max
140 _mm_store_si128(H1 + j, h); // save to H'(i,j)
141 // now compute E'(i+1,j)
142 h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo
143 e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape
144 e = _mm_max_epu8(e, h); // e=E'(i+1,j)
145 _mm_store_si128(E + j, e); // save to E'(i+1,j)
146 // now compute F'(i,j+1)
147 f = _mm_subs_epu8(f, gape);
148 f = _mm_max_epu8(f, h);
149 // get H'(i-1,j) and prepare for the next j
150 h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
151 }
152 // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
153 for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
154 f = _mm_slli_si128(f, 1);
155 for (j = 0; LIKELY(j < slen); ++j) {
156 h = _mm_load_si128(H1 + j);
157 h = _mm_max_epu8(h, f); // h=H'(i,j)
158 _mm_store_si128(H1 + j, h);
159 h = _mm_subs_epu8(h, gapoe);
160 f = _mm_subs_epu8(f, gape);
161 cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
162 if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
163 }
164 }
165 end_loop16:
166 //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
167 __max_16(imax, max); // imax is the maximum number in max
168 if (imax >= a->T) { // write the b array; this condition adds branching unfornately
169 if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
170 if (n_b == m_b) {
171 m_b = m_b? m_b<<1 : 8;
172 b = realloc(b, 8 * m_b);
173 }
174 b[n_b++] = (uint64_t)imax<<32 | i;
175 } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
176 }
177 if (imax > gmax) {
178 gmax = imax; te = i; // te is the end position on the target
179 for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
180 _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
181 if (gmax + q->shift >= 255) break;
182 }
183 S = H1; H1 = H0; H0 = S; // swap H0 and H1
184 }
185 a->score = gmax; a->te = te;
186 { // get a->qe, the end of query match; find the 2nd best score
187 int max = -1, low, high, qlen = slen * 16;
188 uint8_t *t = (uint8_t*)Hmax;
189 for (i = 0, a->qe = -1; i < qlen; ++i, ++t)
190 if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen;
191 //printf("%d,%d\n", max, gmax);
192 i = (a->score + q->max - 1) / q->max;
193 low = te - i; high = te + i;
194 for (i = 0, a->score2 = 0; i < n_b; ++i) {
195 int e = (int32_t)b[i];
196 if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2)
197 a->score2 = b[i]>>32, a->te2 = e;
198 }
199 }
200 free(b);
201 return a->score + q->shift >= 255? 255 : a->score;
202 }
203
204 int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e)
205 {
206 int slen, i, m_b, n_b, te = -1, gmax = 0;
207 uint64_t *b;
208 __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
209
210 #define __max_8(ret, xx) do { \
211 (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
212 (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
213 (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
214 (ret) = _mm_extract_epi16((xx), 0); \
215 } while (0)
216
217 // initialization
218 m_b = n_b = 0; b = 0;
219 zero = _mm_set1_epi32(0);
220 gapoe = _mm_set1_epi16(a->gapo + a->gape);
221 gape = _mm_set1_epi16(a->gape);
222 H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
223 slen = q->slen;
224 for (i = 0; i < slen; ++i) {
225 _mm_store_si128(E + i, zero);
226 _mm_store_si128(H0 + i, zero);
227 _mm_store_si128(Hmax + i, zero);
228 }
229 // the core loop
230 for (i = 0; i < tlen; ++i) {
231 int j, k, imax;
232 __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
233 h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
234 h = _mm_slli_si128(h, 2);
235 for (j = 0; LIKELY(j < slen); ++j) {
236 h = _mm_adds_epi16(h, *S++);
237 e = _mm_load_si128(E + j);
238 h = _mm_max_epi16(h, e);
239 h = _mm_max_epi16(h, f);
240 max = _mm_max_epi16(max, h);
241 _mm_store_si128(H1 + j, h);
242 h = _mm_subs_epu16(h, gapoe);
243 e = _mm_subs_epu16(e, gape);
244 e = _mm_max_epi16(e, h);
245 _mm_store_si128(E + j, e);
246 f = _mm_subs_epu16(f, gape);
247 f = _mm_max_epi16(f, h);
248 h = _mm_load_si128(H0 + j);
249 }
250 for (k = 0; LIKELY(k < 16); ++k) {
251 f = _mm_slli_si128(f, 2);
252 for (j = 0; LIKELY(j < slen); ++j) {
253 h = _mm_load_si128(H1 + j);
254 h = _mm_max_epi16(h, f);
255 _mm_store_si128(H1 + j, h);
256 h = _mm_subs_epu16(h, gapoe);
257 f = _mm_subs_epu16(f, gape);
258 if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
259 }
260 }
261 end_loop8:
262 __max_8(imax, max);
263 if (imax >= a->T) {
264 if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
265 if (n_b == m_b) {
266 m_b = m_b? m_b<<1 : 8;
267 b = realloc(b, 8 * m_b);
268 }
269 b[n_b++] = (uint64_t)imax<<32 | i;
270 } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
271 }
272 if (imax > gmax) {
273 gmax = imax; te = i;
274 for (j = 0; LIKELY(j < slen); ++j)
275 _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
276 }
277 S = H1; H1 = H0; H0 = S;
278 }
279 a->score = gmax; a->te = te;
280 {
281 int max = -1, low, high, qlen = slen * 8;
282 uint16_t *t = (uint16_t*)Hmax;
283 for (i = 0, a->qe = -1; i < qlen; ++i, ++t)
284 if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen;
285 i = (a->score + q->max - 1) / q->max;
286 low = te - i; high = te + i;
287 for (i = 0, a->score2 = 0; i < n_b; ++i) {
288 int e = (int32_t)b[i];
289 if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2)
290 a->score2 = b[i]>>32, a->te2 = e;
291 }
292 }
293 free(b);
294 return a->score;
295 }
296
297 int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a)
298 {
299 if (q->size == 1) return ksw_sse2_16(q, tlen, target, a);
300 else return ksw_sse2_8(q, tlen, target, a);
301 }
302
303 /*******************************************
304 * Main function (not compiled by default) *
305 *******************************************/
306
307 #ifdef _KSW_MAIN
308
309 #include <unistd.h>
310 #include <stdio.h>
311 #include <zlib.h>
312 #include "kseq.h"
313 KSEQ_INIT(gzFile, gzread)
314
315 unsigned char seq_nt4_table[256] = {
316 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
317 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
318 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
319 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
320 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
321 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
322 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
323 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
324 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
325 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
326 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
327 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
328 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
329 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
330 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
331 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
332 };
333
334 int main(int argc, char *argv[])
335 {
336 int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2;
337 int8_t mat[25];
338 ksw_aux_t a;
339 gzFile fpt, fpq;
340 kseq_t *kst, *ksq;
341 // parse command line
342 a.gapo = 5; a.gape = 2; a.T = 10;
343 while ((c = getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) {
344 switch (c) {
345 case 'a': sa = atoi(optarg); break;
346 case 'b': sb = atoi(optarg); break;
347 case 'q': a.gapo = atoi(optarg); break;
348 case 'r': a.gape = atoi(optarg); break;
349 case 't': a.T = atoi(optarg); break;
350 case 'f': forward_only = 1; break;
351 case 's': size = atoi(optarg); break;
352 }
353 }
354 if (optind + 2 > argc) {
355 fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] <target.fa> <query.fa>\n", size, sa, sb, a.gapo, a.gape);
356 return 1;
357 }
358 // initialize scoring matrix
359 for (i = k = 0; i < 5; ++i) {
360 for (j = 0; j < 4; ++j)
361 mat[k++] = i == j? sa : -sb;
362 mat[k++] = 0; // ambiguous base
363 }
364 for (j = 0; j < 5; ++j) mat[k++] = 0;
365 // open file
366 fpt = gzopen(argv[optind], "r"); kst = kseq_init(fpt);
367 fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq);
368 // all-pair alignment
369 while (kseq_read(ksq) > 0) {
370 ksw_query_t *q[2];
371 for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
372 q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat);
373 if (!forward_only) { // reverse
374 for (i = 0; i < ksq->seq.l/2; ++i) {
375 int t = ksq->seq.s[i];
376 ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i];
377 ksq->seq.s[ksq->seq.l-1-i] = t;
378 }
379 for (i = 0; i < ksq->seq.l; ++i)
380 ksq->seq.s[i] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
381 q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat);
382 } else q[1] = 0;
383 gzrewind(fpt); kseq_rewind(kst);
384 while (kseq_read(kst) > 0) {
385 int s;
386 for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
387 s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a);
388 printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1);
389 if (q[1]) {
390 s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a);
391 printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1);
392 }
393 }
394 free(q[0]); free(q[1]);
395 }
396 kseq_destroy(kst); gzclose(fpt);
397 kseq_destroy(ksq); gzclose(fpq);
398 return 0;
399 }
400 #endif // _KSW_MAIN
401 #endif // _NO_SSE2