Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bwape.c @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #include <unistd.h> | |
2 #include <math.h> | |
3 #include <stdlib.h> | |
4 #include <time.h> | |
5 #include <stdio.h> | |
6 #include <string.h> | |
7 #include "bwtaln.h" | |
8 #include "kvec.h" | |
9 #include "bntseq.h" | |
10 #include "utils.h" | |
11 #include "stdaln.h" | |
12 | |
13 typedef struct { | |
14 int n; | |
15 bwtint_t *a; | |
16 } poslist_t; | |
17 | |
18 typedef struct { | |
19 double avg, std; | |
20 bwtint_t low, high, high_bayesian; | |
21 } isize_info_t; | |
22 | |
23 #include "khash.h" | |
24 KHASH_MAP_INIT_INT64(64, poslist_t) | |
25 | |
26 #include "ksort.h" | |
27 KSORT_INIT_GENERIC(uint64_t) | |
28 | |
29 typedef struct { | |
30 kvec_t(uint64_t) arr; | |
31 kvec_t(uint64_t) pos[2]; | |
32 kvec_t(bwt_aln1_t) aln[2]; | |
33 } pe_data_t; | |
34 | |
35 #define MIN_HASH_WIDTH 1000 | |
36 | |
37 static int g_log_n[256]; | |
38 static kh_64_t *g_hash; | |
39 | |
40 void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); | |
41 void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); | |
42 void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); | |
43 int bwa_approx_mapQ(const bwa_seq_t *p, int mm); | |
44 void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2); | |
45 bntseq_t *bwa_open_nt(const char *prefix); | |
46 void bwa_print_sam_SQ(const bntseq_t *bns); | |
47 | |
48 pe_opt_t *bwa_init_pe_opt() | |
49 { | |
50 pe_opt_t *po; | |
51 po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t)); | |
52 po->max_isize = 500; | |
53 po->max_occ = 100000; | |
54 po->n_multi = 3; | |
55 po->N_multi = 10; | |
56 po->type = BWA_PET_STD; | |
57 po->is_sw = 1; | |
58 po->ap_prior = 1e-5; | |
59 return po; | |
60 } | |
61 | |
62 static inline uint64_t hash_64(uint64_t key) | |
63 { | |
64 key += ~(key << 32); | |
65 key ^= (key >> 22); | |
66 key += ~(key << 13); | |
67 key ^= (key >> 8); | |
68 key += (key << 3); | |
69 key ^= (key >> 15); | |
70 key += ~(key << 27); | |
71 key ^= (key >> 31); | |
72 return key; | |
73 } | |
74 /* | |
75 static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x); | |
76 { | |
77 const double a = 0.140012; | |
78 double b, c; | |
79 b = log(x * (2 - x)); | |
80 c = 2./M_PI/a + b / 2.; | |
81 return sqrt(sqrt(c * c - b / a) - c); | |
82 } | |
83 */ | |
84 | |
85 // for normal distribution, this is about 3std | |
86 #define OUTLIER_BOUND 2.0 | |
87 | |
88 static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L) | |
89 { | |
90 uint64_t x, *isizes; | |
91 int n, i, tot, p25, p75, p50, max_len = 1, tmp; | |
92 double skewness = 0.0, kurtosis = 0.0, y; | |
93 | |
94 ii->avg = ii->std = -1.0; | |
95 ii->low = ii->high = ii->high_bayesian = 0; | |
96 isizes = (uint64_t*)calloc(n_seqs, 8); | |
97 for (i = 0, tot = 0; i != n_seqs; ++i) { | |
98 bwa_seq_t *p[2]; | |
99 p[0] = seqs[0] + i; p[1] = seqs[1] + i; | |
100 if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) | |
101 isizes[tot++] = (p[0]->pos < p[1]->pos)? p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos; | |
102 if (p[0]->len > max_len) max_len = p[0]->len; | |
103 if (p[1]->len > max_len) max_len = p[1]->len; | |
104 } | |
105 if (tot < 20) { | |
106 fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n"); | |
107 free(isizes); | |
108 return -1; | |
109 } | |
110 ks_introsort(uint64_t, tot, isizes); | |
111 p25 = isizes[(int)(tot*0.25 + 0.5)]; | |
112 p50 = isizes[(int)(tot*0.50 + 0.5)]; | |
113 p75 = isizes[(int)(tot*0.75 + 0.5)]; | |
114 tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); | |
115 ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned | |
116 ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); | |
117 for (i = 0, x = n = 0; i < tot; ++i) | |
118 if (isizes[i] >= ii->low && isizes[i] <= ii->high) | |
119 ++n, x += isizes[i]; | |
120 ii->avg = (double)x / n; | |
121 for (i = 0; i < tot; ++i) { | |
122 if (isizes[i] >= ii->low && isizes[i] <= ii->high) { | |
123 double tmp = (isizes[i] - ii->avg) * (isizes[i] - ii->avg); | |
124 ii->std += tmp; | |
125 skewness += tmp * (isizes[i] - ii->avg); | |
126 kurtosis += tmp * tmp; | |
127 } | |
128 } | |
129 kurtosis = kurtosis/n / (ii->std / n * ii->std / n) - 3; | |
130 ii->std = sqrt(ii->std / n); // it would be better as n-1, but n is usually very large | |
131 skewness = skewness / n / (ii->std * ii->std * ii->std); | |
132 free(isizes); | |
133 if (isnan(ii->std)) { | |
134 ii->low = ii->high = 0; ii->avg = ii->std = -1.0; | |
135 fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n"); | |
136 return -1; | |
137 } | |
138 for (y = 1.0; y < 10.0; y += 0.01) | |
139 if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; | |
140 ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); | |
141 fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75); | |
142 fprintf(stderr, "[infer_isize] low and high boundaries: %d and %d for estimating avg and std\n", ii->low, ii->high); | |
143 fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std); | |
144 fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf\n", skewness, kurtosis); | |
145 fprintf(stderr, "[infer_isize] inferred maximum insert size: %d (%.2lf sigma)\n", ii->high_bayesian, y); | |
146 return 0; | |
147 } | |
148 | |
149 static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii) | |
150 { | |
151 int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; | |
152 uint64_t last_pos[2][2], o_pos[2], subo_score, o_score; | |
153 max_len = p[0]->full_len; | |
154 if (max_len < p[1]->full_len) max_len = p[1]->full_len; | |
155 if (low_bound < max_len) low_bound = max_len; | |
156 | |
157 // here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize | |
158 #define __pairing_aux(u,v) do { \ | |
159 bwtint_t l = ((v)>>32) + p[(v)&1]->len - ((u)>>32); \ | |
160 if ((u) != (uint64_t)-1 && (v)>>32 > (u)>>32 && l >= max_len \ | |
161 && ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \ | |
162 { \ | |
163 uint64_t s = d->aln[(v)&1].a[(uint32_t)(v)>>1].score + d->aln[(u)&1].a[(uint32_t)(u)>>1].score; \ | |
164 s *= 10; \ | |
165 if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \ | |
166 s = s<<32 | (uint32_t)hash_64((u)>>32<<32 | (v)>>32); \ | |
167 if (s>>32 == o_score>>32) ++o_n; \ | |
168 else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \ | |
169 else ++subo_n; \ | |
170 if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u)&1] = (u), o_pos[(v)&1] = (v); \ | |
171 else if (s < subo_score) subo_score = s; \ | |
172 } \ | |
173 } while (0) | |
174 | |
175 #define __pairing_aux2(q, w) do { \ | |
176 const bwt_aln1_t *r = d->aln[(w)&1].a + ((uint32_t)(w)>>1); \ | |
177 (q)->extra_flag |= SAM_FPP; \ | |
178 if ((q)->pos != (w)>>32 || (q)->strand != r->a) { \ | |
179 (q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = r->a; \ | |
180 (q)->score = r->score; (q)->mapQ = mapQ_p; \ | |
181 (q)->pos = (w)>>32; \ | |
182 if ((q)->mapQ > 0) ++cnt_chg; \ | |
183 } \ | |
184 } while (0) | |
185 | |
186 o_score = subo_score = (uint64_t)-1; | |
187 o_n = subo_n = 0; | |
188 ks_introsort(uint64_t, d->arr.n, d->arr.a); | |
189 for (j = 0; j < 2; ++j) last_pos[j][0] = last_pos[j][1] = (uint64_t)-1; | |
190 if (opt->type == BWA_PET_STD) { | |
191 for (i = 0; i < d->arr.n; ++i) { | |
192 uint64_t x = d->arr.a[i]; | |
193 int strand = d->aln[x&1].a[(uint32_t)x>>1].a; | |
194 if (strand == 1) { // reverse strand, then check | |
195 int y = 1 - (x&1); | |
196 __pairing_aux(last_pos[y][1], x); | |
197 __pairing_aux(last_pos[y][0], x); | |
198 } else { // forward strand, then push | |
199 last_pos[x&1][0] = last_pos[x&1][1]; | |
200 last_pos[x&1][1] = x; | |
201 } | |
202 } | |
203 } else if (opt->type == BWA_PET_SOLID) { | |
204 for (i = 0; i < d->arr.n; ++i) { | |
205 uint64_t x = d->arr.a[i]; | |
206 int strand = d->aln[x&1].a[(uint32_t)x>>1].a; | |
207 if ((strand^x)&1) { // push | |
208 int y = 1 - (x&1); | |
209 __pairing_aux(last_pos[y][1], x); | |
210 __pairing_aux(last_pos[y][0], x); | |
211 } else { // check | |
212 last_pos[x&1][0] = last_pos[x&1][1]; | |
213 last_pos[x&1][1] = x; | |
214 } | |
215 } | |
216 } else { | |
217 fprintf(stderr, "[paring] not implemented yet!\n"); | |
218 exit(1); | |
219 } | |
220 // set pairing | |
221 //fprintf(stderr, "[%d, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n); | |
222 if (o_score != (uint64_t)-1) { | |
223 int mapQ_p = 0; // this is the maximum mapping quality when one end is moved | |
224 //fprintf(stderr, "%d, %d\n", o_n, subo_n); | |
225 if (o_n == 1) { | |
226 if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair | |
227 else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair | |
228 else { | |
229 int n = subo_n > 255? 255 : subo_n; | |
230 mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n]; | |
231 if (mapQ_p < 0) mapQ_p = 0; | |
232 } | |
233 } | |
234 if (p[0]->pos == o_pos[0]>>32 && p[1]->pos == o_pos[1]>>32) { // both ends not moved | |
235 if (p[0]->mapQ > 0 && p[1]->mapQ > 0) { | |
236 int mapQ = p[0]->mapQ + p[1]->mapQ; | |
237 if (mapQ > 60) mapQ = 60; | |
238 p[0]->mapQ = p[1]->mapQ = mapQ; | |
239 } else { | |
240 if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ; | |
241 if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? mapQ_p + 7 : p[0]->mapQ; | |
242 } | |
243 } else if (p[0]->pos == o_pos[0]>>32) { // [1] moved | |
244 p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ; | |
245 if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p; | |
246 } else if (p[1]->pos == o_pos[1]>>32) { // [0] moved | |
247 p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ; | |
248 if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p; | |
249 } else { // both ends moved | |
250 p[0]->seQ = p[1]->seQ = 0; | |
251 mapQ_p -= 20; | |
252 if (mapQ_p < 0) mapQ_p = 0; | |
253 p[0]->mapQ = p[1]->mapQ = mapQ_p; | |
254 } | |
255 __pairing_aux2(p[0], o_pos[0]); | |
256 __pairing_aux2(p[1], o_pos[1]); | |
257 } | |
258 return cnt_chg; | |
259 } | |
260 | |
261 typedef struct { | |
262 kvec_t(bwt_aln1_t) aln; | |
263 } aln_buf_t; | |
264 | |
265 int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii, | |
266 const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii) | |
267 { | |
268 int i, j, cnt_chg = 0; | |
269 char str[1024]; | |
270 bwt_t *bwt[2]; | |
271 pe_data_t *d; | |
272 aln_buf_t *buf[2]; | |
273 | |
274 d = (pe_data_t*)calloc(1, sizeof(pe_data_t)); | |
275 buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); | |
276 buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); | |
277 | |
278 if (_bwt[0] == 0) { // load forward SA | |
279 strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); | |
280 strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]); | |
281 strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); | |
282 strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]); | |
283 } else bwt[0] = _bwt[0], bwt[1] = _bwt[1]; | |
284 | |
285 // SE | |
286 for (i = 0; i != n_seqs; ++i) { | |
287 bwa_seq_t *p[2]; | |
288 for (j = 0; j < 2; ++j) { | |
289 int n_aln; | |
290 p[j] = seqs[j] + i; | |
291 p[j]->n_multi = 0; | |
292 p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2); | |
293 fread(&n_aln, 4, 1, fp_sa[j]); | |
294 if (n_aln > kv_max(d->aln[j])) | |
295 kv_resize(bwt_aln1_t, d->aln[j], n_aln); | |
296 d->aln[j].n = n_aln; | |
297 fread(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]); | |
298 kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j] | |
299 // generate SE alignment and mapping quality | |
300 bwa_aln2seq(n_aln, d->aln[j].a, p[j]); | |
301 if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) { | |
302 int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff; | |
303 p[j]->pos = p[j]->strand? bwt_sa(bwt[0], p[j]->sa) | |
304 : bwt[1]->seq_len - (bwt_sa(bwt[1], p[j]->sa) + p[j]->len); | |
305 p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff); | |
306 } | |
307 } | |
308 } | |
309 | |
310 // infer isize | |
311 infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt[0]->seq_len); | |
312 if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii; | |
313 | |
314 // PE | |
315 for (i = 0; i != n_seqs; ++i) { | |
316 bwa_seq_t *p[2]; | |
317 for (j = 0; j < 2; ++j) { | |
318 p[j] = seqs[j] + i; | |
319 kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln); | |
320 } | |
321 if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) | |
322 && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) | |
323 { // only when both ends mapped | |
324 uint64_t x; | |
325 int j, k, n_occ[2]; | |
326 for (j = 0; j < 2; ++j) { | |
327 n_occ[j] = 0; | |
328 for (k = 0; k < d->aln[j].n; ++k) | |
329 n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1; | |
330 } | |
331 if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue; | |
332 d->arr.n = 0; | |
333 for (j = 0; j < 2; ++j) { | |
334 for (k = 0; k < d->aln[j].n; ++k) { | |
335 bwt_aln1_t *r = d->aln[j].a + k; | |
336 bwtint_t l; | |
337 if (r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table | |
338 uint64_t key = (uint64_t)r->k<<32 | r->l; | |
339 int ret; | |
340 khint_t iter = kh_put(64, g_hash, key, &ret); | |
341 if (ret) { // not in the hash table; ret must equal 1 as we never remove elements | |
342 poslist_t *z = &kh_val(g_hash, iter); | |
343 z->n = r->l - r->k + 1; | |
344 z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); | |
345 for (l = r->k; l <= r->l; ++l) | |
346 z->a[l - r->k] = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len); | |
347 } | |
348 for (l = 0; l < kh_val(g_hash, iter).n; ++l) { | |
349 x = kh_val(g_hash, iter).a[l]; | |
350 x = x<<32 | k<<1 | j; | |
351 kv_push(uint64_t, d->arr, x); | |
352 } | |
353 } else { // then calculate on the fly | |
354 for (l = r->k; l <= r->l; ++l) { | |
355 x = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len); | |
356 x = x<<32 | k<<1 | j; | |
357 kv_push(uint64_t, d->arr, x); | |
358 } | |
359 } | |
360 } | |
361 } | |
362 cnt_chg += pairing(p, d, opt, gopt->s_mm, ii); | |
363 } | |
364 | |
365 if (opt->N_multi || opt->n_multi) { | |
366 for (j = 0; j < 2; ++j) { | |
367 if (p[j]->type != BWA_TYPE_NO_MATCH) { | |
368 int k; | |
369 if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) { | |
370 bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi); | |
371 } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi); | |
372 for (k = 0; k < p[j]->n_multi; ++k) { | |
373 bwt_multi1_t *q = p[j]->multi + k; | |
374 q->pos = q->strand? bwt_sa(bwt[0], q->pos) : bwt[1]->seq_len - (bwt_sa(bwt[1], q->pos) + p[j]->len); | |
375 } | |
376 } | |
377 } | |
378 } | |
379 } | |
380 | |
381 // free | |
382 for (i = 0; i < n_seqs; ++i) { | |
383 kv_destroy(buf[0][i].aln); | |
384 kv_destroy(buf[1][i].aln); | |
385 } | |
386 free(buf[0]); free(buf[1]); | |
387 if (_bwt[0] == 0) { | |
388 bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); | |
389 } | |
390 kv_destroy(d->arr); | |
391 kv_destroy(d->pos[0]); kv_destroy(d->pos[1]); | |
392 kv_destroy(d->aln[0]); kv_destroy(d->aln[1]); | |
393 free(d); | |
394 return cnt_chg; | |
395 } | |
396 | |
397 #define SW_MIN_MATCH_LEN 20 | |
398 #define SW_MIN_MAPQ 17 | |
399 | |
400 // cnt = n_mm<<16 | n_gapo<<8 | n_gape | |
401 bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, | |
402 int *n_cigar, uint32_t *_cnt) | |
403 { | |
404 bwa_cigar_t *cigar = 0; | |
405 ubyte_t *ref_seq; | |
406 bwtint_t k, x, y, l; | |
407 int path_len, ret; | |
408 AlnParam ap = aln_param_bwa; | |
409 path_t *path, *p; | |
410 | |
411 // check whether there are too many N's | |
412 if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0; | |
413 for (k = 0, x = 0; k < len; ++k) | |
414 if (seq[k] >= 4) ++x; | |
415 if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0; | |
416 | |
417 // get reference subsequence | |
418 ref_seq = (ubyte_t*)calloc(reglen, 1); | |
419 for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) | |
420 ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; | |
421 path = (path_t*)calloc(l+len, sizeof(path_t)); | |
422 | |
423 // do alignment | |
424 ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, 0); | |
425 if (ret < 0) { | |
426 free(path); free(cigar); free(ref_seq); *n_cigar = 0; | |
427 return 0; | |
428 } | |
429 cigar = bwa_aln_path2cigar(path, path_len, n_cigar); | |
430 | |
431 // check whether the alignment is good enough | |
432 for (k = 0, x = y = 0; k < *n_cigar; ++k) { | |
433 bwa_cigar_t c = cigar[k]; | |
434 if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c); | |
435 else if (__cigar_op(c) == FROM_D) x += __cigar_len(c); | |
436 else y += __cigar_len(c); | |
437 } | |
438 if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough | |
439 free(path); free(cigar); free(ref_seq); | |
440 *n_cigar = 0; | |
441 return 0; | |
442 } | |
443 | |
444 { // update cigar and coordinate; | |
445 int start, end; | |
446 p = path + path_len - 1; | |
447 *beg += (p->i? p->i : 1) - 1; | |
448 start = (p->j? p->j : 1) - 1; | |
449 end = path->j; | |
450 cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); | |
451 if (start) { | |
452 memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); | |
453 cigar[0] = __cigar_create(3, start); | |
454 ++(*n_cigar); | |
455 } | |
456 if (end < len) { | |
457 /*cigar[*n_cigar] = 3<<14 | (len - end);*/ | |
458 cigar[*n_cigar] = __cigar_create(3, (len - end)); | |
459 ++(*n_cigar); | |
460 } | |
461 } | |
462 | |
463 { // set *cnt | |
464 int n_mm, n_gapo, n_gape; | |
465 n_mm = n_gapo = n_gape = 0; | |
466 p = path + path_len - 1; | |
467 x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0; | |
468 for (k = 0; k < *n_cigar; ++k) { | |
469 bwa_cigar_t c = cigar[k]; | |
470 if (__cigar_op(c) == FROM_M) { | |
471 for (l = 0; l < (__cigar_len(c)); ++l) | |
472 if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm; | |
473 x += __cigar_len(c), y += __cigar_len(c); | |
474 } else if (__cigar_op(c) == FROM_D) { | |
475 x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; | |
476 } else if (__cigar_op(c) == FROM_I) { | |
477 y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; | |
478 } | |
479 } | |
480 *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape; | |
481 } | |
482 | |
483 free(ref_seq); free(path); | |
484 return cigar; | |
485 } | |
486 | |
487 ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii) | |
488 { | |
489 ubyte_t *pacseq; | |
490 int i; | |
491 uint64_t n_tot[2], n_mapped[2]; | |
492 | |
493 // load reference sequence | |
494 if (_pacseq == 0) { | |
495 pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); | |
496 rewind(bns->fp_pac); | |
497 fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); | |
498 } else pacseq = (ubyte_t*)_pacseq; | |
499 if (!popt->is_sw || ii->avg < 0.0) return pacseq; | |
500 | |
501 // perform mate alignment | |
502 n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0; | |
503 for (i = 0; i != n_seqs; ++i) { | |
504 bwa_seq_t *p[2]; | |
505 p[0] = seqs[0] + i; p[1] = seqs[1] + i; | |
506 if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ | |
507 int k, n_cigar[2], is_singleton, mapQ = 0; | |
508 int64_t beg[2], end[2]; | |
509 bwa_cigar_t *cigar[2]; | |
510 uint32_t cnt[2]; | |
511 | |
512 /* In the following, _pref points to the reference read | |
513 * which must be aligned; _pmate points to its mate which is | |
514 * considered to be modified. */ | |
515 | |
516 #define __set_rght_coor(_a, _b, _pref, _pmate) do { \ | |
517 (_a) = _pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \ | |
518 (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ | |
519 if ((_a) < _pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \ | |
520 if ((_b) > bns->l_pac) (_b) = bns->l_pac; \ | |
521 } while (0) | |
522 | |
523 #define __set_left_coor(_a, _b, _pref, _pmate) do { \ | |
524 (_a) = _pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \ | |
525 (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ | |
526 if ((_a) < 0) (_a) = 0; \ | |
527 if ((_b) > _pref->pos) (_b) = _pref->pos; \ | |
528 } while (0) | |
529 | |
530 #define __set_fixed(_pref, _pmate, _beg, _cnt) do { \ | |
531 _pmate->type = BWA_TYPE_MATESW; \ | |
532 _pmate->pos = _beg; \ | |
533 _pmate->seQ = _pref->seQ; \ | |
534 _pmate->strand = (popt->type == BWA_PET_STD)? 1 - _pref->strand : _pref->strand; \ | |
535 _pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \ | |
536 _pmate->extra_flag |= SAM_FPP; \ | |
537 _pref->extra_flag |= SAM_FPP; \ | |
538 } while (0) | |
539 | |
540 is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0; | |
541 | |
542 ++n_tot[is_singleton]; | |
543 cigar[0] = cigar[1] = 0; | |
544 n_cigar[0] = n_cigar[1] = 0; | |
545 if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered | |
546 for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified | |
547 ubyte_t *seq; | |
548 if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip | |
549 if (popt->type == BWA_PET_STD) { | |
550 if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate | |
551 __set_rght_coor(beg[k], end[k], p[1-k], p[k]); | |
552 seq = p[k]->rseq; | |
553 } else { // then the mate is on forward stand and has smaller coordinate | |
554 __set_left_coor(beg[k], end[k], p[1-k], p[k]); | |
555 seq = p[k]->seq; | |
556 seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly | |
557 } | |
558 } else { // BWA_PET_SOLID | |
559 if (p[1-k]->strand == 0) { // R3-F3 pairing | |
560 if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 | |
561 else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 | |
562 seq = p[k]->rseq; | |
563 seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed | |
564 } else { // F3-R3 pairing | |
565 if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 | |
566 else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 | |
567 seq = p[k]->seq; | |
568 } | |
569 } | |
570 // perform SW alignment | |
571 cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]); | |
572 // now revserse sequence back such that p[*]->seq looks untouched | |
573 if (popt->type == BWA_PET_STD) { | |
574 if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0); | |
575 } else { | |
576 if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0); | |
577 } | |
578 } | |
579 k = -1; // no read to be changed | |
580 if (cigar[0] && cigar[1]) { | |
581 k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed | |
582 mapQ = abs(p[1]->mapQ - p[0]->mapQ); | |
583 } else if (cigar[0]) k = 0, mapQ = p[1]->mapQ; | |
584 else if (cigar[1]) k = 1, mapQ = p[0]->mapQ; | |
585 if (k >= 0 && p[k]->pos != beg[k]) { | |
586 ++n_mapped[is_singleton]; | |
587 { // recalculate mapping quality | |
588 int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8; | |
589 if (tmp <= 0) tmp = 1; | |
590 if (mapQ > tmp) mapQ = tmp; | |
591 p[k]->mapQ = p[1-k]->mapQ = mapQ; | |
592 p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? p[1-k]->seQ : mapQ; | |
593 } | |
594 // update CIGAR | |
595 free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0; | |
596 p[k]->n_cigar = n_cigar[k]; | |
597 // update the rest of information | |
598 __set_fixed(p[1-k], p[k], beg[k], cnt[k]); | |
599 } | |
600 free(cigar[0]); free(cigar[1]); | |
601 } | |
602 } | |
603 fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n", | |
604 (long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ); | |
605 fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n", | |
606 (long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ); | |
607 return pacseq; | |
608 } | |
609 | |
610 void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) | |
611 { | |
612 int i, j, n_seqs, tot_seqs = 0; | |
613 bwa_seq_t *seqs[2]; | |
614 bwa_seqio_t *ks[2]; | |
615 clock_t t; | |
616 bntseq_t *bns, *ntbns = 0; | |
617 FILE *fp_sa[2]; | |
618 gap_opt_t opt; | |
619 khint_t iter; | |
620 isize_info_t last_ii; // this is for the last batch of reads | |
621 char str[1024]; | |
622 bwt_t *bwt[2]; | |
623 uint8_t *pac; | |
624 | |
625 // initialization | |
626 pac = 0; bwt[0] = bwt[1] = 0; | |
627 for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); | |
628 bns = bns_restore(prefix); | |
629 srand48(bns->seed); | |
630 for (i = 0; i < 2; ++i) { | |
631 ks[i] = bwa_seq_open(fn_fa[i]); | |
632 fp_sa[i] = xopen(fn_sa[i], "r"); | |
633 } | |
634 g_hash = kh_init(64); | |
635 last_ii.avg = -1.0; | |
636 | |
637 fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); | |
638 fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); | |
639 if (!(opt.mode & BWA_MODE_COMPREAD)) { | |
640 popt->type = BWA_PET_SOLID; | |
641 ntbns = bwa_open_nt(prefix); | |
642 } else { // for Illumina alignment only | |
643 if (popt->is_preload) { | |
644 strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); | |
645 strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]); | |
646 strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); | |
647 strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]); | |
648 pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1); | |
649 rewind(bns->fp_pac); | |
650 fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); | |
651 } | |
652 } | |
653 | |
654 // core loop | |
655 bwa_print_sam_SQ(bns); | |
656 while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) { | |
657 int cnt_chg; | |
658 isize_info_t ii; | |
659 ubyte_t *pacseq; | |
660 | |
661 seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual); | |
662 tot_seqs += n_seqs; | |
663 t = clock(); | |
664 | |
665 fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n"); | |
666 cnt_chg = bwa_cal_pac_pos_pe(prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii); | |
667 fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); | |
668 fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg); | |
669 | |
670 fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n"); | |
671 pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii); | |
672 fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); | |
673 | |
674 fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... "); | |
675 for (j = 0; j < 2; ++j) | |
676 bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns); | |
677 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); | |
678 if (pac == 0) free(pacseq); | |
679 | |
680 fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... "); | |
681 for (i = 0; i < n_seqs; ++i) { | |
682 bwa_print_sam1(bns, seqs[0] + i, seqs[1] + i, opt.mode, opt.max_top2); | |
683 bwa_print_sam1(bns, seqs[1] + i, seqs[0] + i, opt.mode, opt.max_top2); | |
684 } | |
685 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); | |
686 | |
687 for (j = 0; j < 2; ++j) | |
688 bwa_free_read_seq(n_seqs, seqs[j]); | |
689 fprintf(stderr, "[bwa_sai2sam_pe_core] %d sequences have been processed.\n", tot_seqs); | |
690 last_ii = ii; | |
691 } | |
692 | |
693 // destroy | |
694 bns_destroy(bns); | |
695 if (ntbns) bns_destroy(ntbns); | |
696 for (i = 0; i < 2; ++i) { | |
697 bwa_seq_close(ks[i]); | |
698 fclose(fp_sa[i]); | |
699 } | |
700 for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter) | |
701 if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a); | |
702 kh_destroy(64, g_hash); | |
703 if (pac) { | |
704 free(pac); bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); | |
705 } | |
706 } | |
707 | |
708 int bwa_sai2sam_pe(int argc, char *argv[]) | |
709 { | |
710 int c; | |
711 pe_opt_t *popt; | |
712 popt = bwa_init_pe_opt(); | |
713 while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:")) >= 0) { | |
714 switch (c) { | |
715 case 'a': popt->max_isize = atoi(optarg); break; | |
716 case 'o': popt->max_occ = atoi(optarg); break; | |
717 case 's': popt->is_sw = 0; break; | |
718 case 'P': popt->is_preload = 1; break; | |
719 case 'n': popt->n_multi = atoi(optarg); break; | |
720 case 'N': popt->N_multi = atoi(optarg); break; | |
721 case 'c': popt->ap_prior = atof(optarg); break; | |
722 case 'f': freopen(optarg, "w", stdout); break; | |
723 default: return 1; | |
724 } | |
725 } | |
726 | |
727 if (optind + 5 > argc) { | |
728 fprintf(stderr, "\n"); | |
729 fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n"); | |
730 fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize); | |
731 fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ); | |
732 fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi); | |
733 fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi); | |
734 fprintf(stderr, " -c FLOAT prior of chimeric rate [%.1le]\n", popt->ap_prior); | |
735 fprintf(stderr, " -P preload index into memory (for base-space reads only)\n"); | |
736 fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n"); | |
737 fprintf(stderr, " -f FILE sam file to output results to instead of stdout\n\n"); | |
738 fprintf(stderr, "Notes: 1. For SOLiD reads, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n"); | |
739 fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n"); | |
740 fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n"); | |
741 fprintf(stderr, "\n"); | |
742 return 1; | |
743 } | |
744 bwa_sai2sam_pe_core(argv[optind], argv + optind + 1, argv + optind+3, popt); | |
745 free(popt); | |
746 return 0; | |
747 } |