Mercurial > repos > ashvark > qiime_1_8_0
comparison bwa-0.6.2/bwtgap.c @ 2:a294fbfcb1db draft default tip
Uploaded BWA
author | ashvark |
---|---|
date | Fri, 18 Jul 2014 07:55:59 -0400 |
parents | dd1186b11b3b |
children |
comparison
equal
deleted
inserted
replaced
1:a9636dc1e99a | 2:a294fbfcb1db |
---|---|
1 #include <stdio.h> | |
2 #include <stdlib.h> | |
3 #include <string.h> | |
4 #include "bwtgap.h" | |
5 #include "bwtaln.h" | |
6 | |
7 #define STATE_M 0 | |
8 #define STATE_I 1 | |
9 #define STATE_D 2 | |
10 | |
11 #define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape) | |
12 | |
13 gap_stack_t *gap_init_stack2(int max_score) | |
14 { | |
15 gap_stack_t *stack; | |
16 stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); | |
17 stack->n_stacks = max_score; | |
18 stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); | |
19 return stack; | |
20 } | |
21 | |
22 gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) | |
23 { | |
24 return gap_init_stack2(aln_score(max_mm+1, max_gapo+1, max_gape+1, opt)); | |
25 } | |
26 | |
27 void gap_destroy_stack(gap_stack_t *stack) | |
28 { | |
29 int i; | |
30 for (i = 0; i != stack->n_stacks; ++i) free(stack->stacks[i].stack); | |
31 free(stack->stacks); | |
32 free(stack); | |
33 } | |
34 | |
35 static void gap_reset_stack(gap_stack_t *stack) | |
36 { | |
37 int i; | |
38 for (i = 0; i != stack->n_stacks; ++i) | |
39 stack->stacks[i].n_entries = 0; | |
40 stack->best = stack->n_stacks; | |
41 stack->n_entries = 0; | |
42 } | |
43 | |
44 static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, | |
45 int state, int is_diff, const gap_opt_t *opt) | |
46 { | |
47 int score; | |
48 gap_entry_t *p; | |
49 gap_stack1_t *q; | |
50 score = aln_score(n_mm, n_gapo, n_gape, opt); | |
51 q = stack->stacks + score; | |
52 if (q->n_entries == q->m_entries) { | |
53 q->m_entries = q->m_entries? q->m_entries<<1 : 4; | |
54 q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); | |
55 } | |
56 p = q->stack + q->n_entries; | |
57 p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l; | |
58 p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state; | |
59 p->last_diff_pos = is_diff? i : 0; | |
60 ++(q->n_entries); | |
61 ++(stack->n_entries); | |
62 if (stack->best > score) stack->best = score; | |
63 } | |
64 | |
65 static inline void gap_pop(gap_stack_t *stack, gap_entry_t *e) | |
66 { | |
67 gap_stack1_t *q; | |
68 q = stack->stacks + stack->best; | |
69 *e = q->stack[q->n_entries - 1]; | |
70 --(q->n_entries); | |
71 --(stack->n_entries); | |
72 if (q->n_entries == 0 && stack->n_entries) { // reset best | |
73 int i; | |
74 for (i = stack->best + 1; i < stack->n_stacks; ++i) | |
75 if (stack->stacks[i].n_entries != 0) break; | |
76 stack->best = i; | |
77 } else if (stack->n_entries == 0) stack->best = stack->n_stacks; | |
78 } | |
79 | |
80 static inline void gap_shadow(int x, int len, bwtint_t max, int last_diff_pos, bwt_width_t *w) | |
81 { | |
82 int i, j; | |
83 for (i = j = 0; i < last_diff_pos; ++i) { | |
84 if (w[i].w > x) w[i].w -= x; | |
85 else if (w[i].w == x) { | |
86 w[i].bid = 1; | |
87 w[i].w = max - (++j); | |
88 } // else should not happen | |
89 } | |
90 } | |
91 | |
92 static inline int int_log2(uint32_t v) | |
93 { | |
94 int c = 0; | |
95 if (v & 0xffff0000u) { v >>= 16; c |= 16; } | |
96 if (v & 0xff00) { v >>= 8; c |= 8; } | |
97 if (v & 0xf0) { v >>= 4; c |= 4; } | |
98 if (v & 0xc) { v >>= 2; c |= 2; } | |
99 if (v & 0x2) c |= 1; | |
100 return c; | |
101 } | |
102 | |
103 bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width, | |
104 bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) | |
105 { | |
106 int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt); | |
107 int best_diff = opt->max_diff + 1, max_diff = opt->max_diff; | |
108 int best_cnt = 0; | |
109 int max_entries = 0, j, _j, n_aln, m_aln; | |
110 bwt_aln1_t *aln; | |
111 | |
112 m_aln = 4; n_aln = 0; | |
113 aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t)); | |
114 | |
115 // check whether there are too many N | |
116 for (j = _j = 0; j < len; ++j) | |
117 if (seq[j] > 3) ++_j; | |
118 if (_j > max_diff) { | |
119 *_n_aln = n_aln; | |
120 return aln; | |
121 } | |
122 | |
123 //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w); | |
124 gap_reset_stack(stack); // reset stack | |
125 gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, opt); | |
126 | |
127 while (stack->n_entries) { | |
128 gap_entry_t e; | |
129 int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp; | |
130 bwtint_t k, l, cnt_k[4], cnt_l[4], occ; | |
131 | |
132 if (max_entries < stack->n_entries) max_entries = stack->n_entries; | |
133 if (stack->n_entries > opt->max_entries) break; | |
134 gap_pop(stack, &e); // get the best entry | |
135 k = e.k; l = e.l; // SA interval | |
136 i = e.info&0xffff; // length | |
137 if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed | |
138 | |
139 m = max_diff - (e.n_mm + e.n_gapo); | |
140 if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape; | |
141 if (m < 0) continue; | |
142 if (seed_width) { // apply seeding | |
143 m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo); | |
144 if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape; | |
145 } | |
146 //printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%u]\t[%u,%u]\t%d\n", stack->n_entries, a, i, "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos); | |
147 if (i > 0 && m < width[i-1].bid) continue; | |
148 | |
149 // check whether a hit is found | |
150 hit_found = 0; | |
151 if (i == 0) hit_found = 1; | |
152 else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed | |
153 if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1; | |
154 else continue; // no hit, skip | |
155 } | |
156 | |
157 if (hit_found) { // action for found hits | |
158 int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt); | |
159 int do_add = 1; | |
160 //printf("#2 hits found: %d:(%u,%u)\n", e.n_mm+e.n_gapo, k, l); | |
161 if (n_aln == 0) { | |
162 best_score = score; | |
163 best_diff = e.n_mm + e.n_gapo; | |
164 if (opt->mode & BWA_MODE_GAPE) best_diff += e.n_gape; | |
165 if (!(opt->mode & BWA_MODE_NONSTOP)) | |
166 max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour | |
167 } | |
168 if (score == best_score) best_cnt += l - k + 1; | |
169 else if (best_cnt > opt->max_top2) break; // top2b behaviour | |
170 if (e.n_gapo) { // check whether the hit has been found. this may happen when a gap occurs in a tandem repeat | |
171 for (j = 0; j != n_aln; ++j) | |
172 if (aln[j].k == k && aln[j].l == l) break; | |
173 if (j < n_aln) do_add = 0; | |
174 } | |
175 if (do_add) { // append | |
176 bwt_aln1_t *p; | |
177 gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width); | |
178 if (n_aln == m_aln) { | |
179 m_aln <<= 1; | |
180 aln = (bwt_aln1_t*)realloc(aln, m_aln * sizeof(bwt_aln1_t)); | |
181 memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); | |
182 } | |
183 p = aln + n_aln; | |
184 p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; | |
185 p->k = k; p->l = l; | |
186 p->score = score; | |
187 ++n_aln; | |
188 } | |
189 continue; | |
190 } | |
191 | |
192 --i; | |
193 bwt_2occ4(bwt, k - 1, l, cnt_k, cnt_l); // retrieve Occ values | |
194 occ = l - k + 1; | |
195 // test whether diff is allowed | |
196 allow_diff = allow_M = 1; | |
197 if (i > 0) { | |
198 int ii = i - (len - opt->seed_len); | |
199 if (width[i-1].bid > m-1) allow_diff = 0; | |
200 else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0; | |
201 if (seed_width && ii > 0) { | |
202 if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0; | |
203 else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1 | |
204 && seed_width[ii-1].w == seed_width[ii].w) allow_M = 0; | |
205 } | |
206 } | |
207 // indels | |
208 tmp = (opt->mode & BWA_MODE_LOGGAP)? int_log2(e.n_gape + e.n_gapo)/2+1 : e.n_gapo + e.n_gape; | |
209 if (allow_diff && i >= opt->indel_end_skip + tmp && len - i >= opt->indel_end_skip + tmp) { | |
210 if (e.state == STATE_M) { // gap open | |
211 if (e.n_gapo < opt->max_gapo) { // gap open is allowed | |
212 // insertion | |
213 gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt); | |
214 // deletion | |
215 for (j = 0; j != 4; ++j) { | |
216 k = bwt->L2[j] + cnt_k[j] + 1; | |
217 l = bwt->L2[j] + cnt_l[j]; | |
218 if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt); | |
219 } | |
220 } | |
221 } else if (e.state == STATE_I) { // extention of an insertion | |
222 if (e.n_gape < opt->max_gape) // gap extention is allowed | |
223 gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt); | |
224 } else if (e.state == STATE_D) { // extention of a deletion | |
225 if (e.n_gape < opt->max_gape) { // gap extention is allowed | |
226 if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) { | |
227 for (j = 0; j != 4; ++j) { | |
228 k = bwt->L2[j] + cnt_k[j] + 1; | |
229 l = bwt->L2[j] + cnt_l[j]; | |
230 if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt); | |
231 } | |
232 } | |
233 } | |
234 } | |
235 } | |
236 // mismatches | |
237 if (allow_diff && allow_M) { // mismatch is allowed | |
238 for (j = 1; j <= 4; ++j) { | |
239 int c = (seq[i] + j) & 3; | |
240 int is_mm = (j != 4 || seq[i] > 3); | |
241 k = bwt->L2[c] + cnt_k[c] + 1; | |
242 l = bwt->L2[c] + cnt_l[c]; | |
243 if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt); | |
244 } | |
245 } else if (seq[i] < 4) { // try exact match only | |
246 int c = seq[i] & 3; | |
247 k = bwt->L2[c] + cnt_k[c] + 1; | |
248 l = bwt->L2[c] + cnt_l[c]; | |
249 if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt); | |
250 } | |
251 } | |
252 | |
253 *_n_aln = n_aln; | |
254 //fprintf(stderr, "max_entries = %d\n", max_entries); | |
255 return aln; | |
256 } |