annotate bwa-0.6.2/bwtsw2_pair.c @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
1 #include <math.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
2 #include <stdio.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
3 #include <stdlib.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
4 #include <string.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
5 #include "bwt.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
6 #include "bntseq.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
7 #include "bwtsw2.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
8 #include "kstring.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
9 #ifndef _NO_SSE2
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
10 #include "ksw.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
11 #else
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
12 #include "stdaln.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
13 #endif
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
14
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
15 #define MIN_RATIO 0.8
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
16 #define OUTLIER_BOUND 2.0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
17 #define MAX_STDDEV 4.0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
18 #define EXT_STDDEV 4.0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
19
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
20 typedef struct {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
21 int low, high, failed;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
22 double avg, std;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
23 } bsw2pestat_t;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
24
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
25 bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
26 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
27 extern void ks_introsort_uint64_t(size_t n, uint64_t *a);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
28 int i, k, x, p25, p50, p75, tmp, max_len = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
29 uint64_t *isize;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
30 bsw2pestat_t r;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
31
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
32 memset(&r, 0, sizeof(bsw2pestat_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
33 isize = calloc(n, 8);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
34 for (i = k = 0; i < n; i += 2) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
35 bsw2hit_t *t[2];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
36 int l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
37 if (buf[i] == 0 || buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
38 t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
39 if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
40 if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
41 l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
42 if (l >= max_ins) continue; // skip pairs with excessively large insert
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
43 max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
44 max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
45 isize[k++] = l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
46 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
47 ks_introsort_uint64_t(k, isize);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
48 p25 = isize[(int)(.25 * k + .499)];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
49 p50 = isize[(int)(.50 * k + .499)];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
50 p75 = isize[(int)(.75 * k + .499)];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
51 ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
52 if (k < 8) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
53 ksprintf(msg, "[%s] fail to infer the insert size distribution.\n", __func__);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
54 free(isize);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
55 r.failed = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
56 return r;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
57 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
58 tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
59 r.low = tmp > max_len? tmp : max_len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
60 if (r.low < 1) r.low = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
61 r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
62 ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
63 ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
64 for (i = x = 0, r.avg = 0; i < k; ++i)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
65 if (isize[i] >= r.low && isize[i] <= r.high)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
66 r.avg += isize[i], ++x;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
67 r.avg /= x;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
68 for (i = 0, r.std = 0; i < k; ++i)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
69 if (isize[i] >= r.low && isize[i] <= r.high)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
70 r.std += (isize[i] - r.avg) * (isize[i] - r.avg);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
71 r.std = sqrt(r.std / x);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
72 ksprintf(msg, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
73 tmp = (int)(p25 - 3. * (p75 - p25) + .499);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
74 r.low = tmp > max_len? tmp : max_len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
75 if (r.low < 1) r.low = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
76 r.high = (int)(p75 + 3. * (p75 - p25) + .499);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
77 if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
78 r.low = tmp > max_len? tmp : max_len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
79 if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
80 ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
81 free(isize);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
82 return r;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
83 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
84
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
85 typedef struct {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
86 int n_cigar, beg, end, len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
87 int64_t pos;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
88 uint32_t *cigar;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
89 } pairaux_t;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
90
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
91 extern unsigned char nst_nt4_table[256];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
92
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
93 void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25])
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
94 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
95 extern void seq_reverse(int len, ubyte_t *seq, int is_comp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
96 int64_t k, beg, end;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
97 uint8_t *seq, *ref;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
98 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
99 // compute the region start and end
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
100 a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
101 if (h->is_rev == 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
102 beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
103 if (beg < h->k) beg = h->k;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
104 end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
105 a->is_rev = 1; a->flag |= 16;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
106 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
107 beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
108 end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
109 if (end > h->k + (h->end - h->beg)) end = h->k + (h->end - h->beg);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
110 a->is_rev = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
111 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
112 if (beg < 1) beg = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
113 if (end > l_pac) end = l_pac;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
114 if (end - beg < l_mseq) return;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
115 // generate the sequence
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
116 seq = malloc(l_mseq + (end - beg));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
117 ref = seq + l_mseq;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
118 for (k = beg; k < end; ++k)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
119 ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
120 if (h->is_rev == 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
121 for (i = 0; i < l_mseq; ++i) { // on the reverse strand
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
122 int c = nst_nt4_table[(int)mseq[i]];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
123 seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
124 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
125 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
126 for (i = 0; i < l_mseq; ++i) // on the forward strand
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
127 seq[i] = nst_nt4_table[(int)mseq[i]];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
128 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
129 #ifndef _NO_SSE2
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
130 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
131 ksw_query_t *q;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
132 ksw_aux_t aux[2];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
133 // forward Smith-Waterman
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
134 aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
135 q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
136 ksw_sse2(q, end - beg, ref, &aux[0]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
137 free(q);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
138 if (aux[0].score < opt->t) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
139 free(seq);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
140 return;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
141 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
142 ++aux[0].qe; ++aux[0].te;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
143 // reverse Smith-Waterman
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
144 seq_reverse(aux[0].qe, seq, 0);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
145 seq_reverse(aux[0].te, ref, 0);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
146 q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
147 ksw_sse2(q, aux[0].te, ref, &aux[1]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
148 free(q);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
149 ++aux[1].qe; ++aux[1].te;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
150 // write output
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
151 a->G = aux[0].score;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
152 a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
153 if (a->G2 < opt->t) a->G2 = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
154 if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
155 a->k = beg + (aux[0].te - aux[1].te);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
156 a->len = aux[1].te;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
157 a->beg = aux[0].qe - aux[1].qe;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
158 a->end = aux[0].qe;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
159 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
160 #else
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
161 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
162 AlnParam ap;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
163 path_t path[2];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
164 int matrix[25];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
165 for (i = 0; i < 25; ++i) matrix[i] = g_mat[i];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
166 ap.gap_open = opt->q; ap.gap_ext = opt->r; ap.gap_end = opt->r;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
167 ap.matrix = matrix; ap.row = 5; ap.band_width = 50;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
168 a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
169 if (a->G < opt->t) a->G = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
170 if (a->G2 < opt->t) a->G2 = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
171 a->k = beg + path[0].i - 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
172 a->len = path[1].i - path[0].i + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
173 a->beg = path[0].j - 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
174 a->end = path[1].j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
175 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
176 #endif
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
177 if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
178 free(seq);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
179 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
180
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
181 void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
182 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
183 extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
184 bsw2pestat_t pes;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
185 int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
186 int8_t g_mat[25];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
187 kstring_t msg;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
188 memset(&msg, 0, sizeof(kstring_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
189 pes = bsw2_stat(n, hits, &msg, opt->max_ins);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
190 for (i = k = 0; i < 5; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
191 for (j = 0; j < 4; ++j)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
192 g_mat[k++] = i == j? opt->a : -opt->b;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
193 g_mat[k++] = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
194 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
195 for (i = 0; i < n; i += 2) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
196 bsw2hit_t a[2];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
197 memset(&a, 0, sizeof(bsw2hit_t) * 2);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
198 a[0].flag = 1<<6; a[1].flag = 1<<7;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
199 for (j = 0; j < 2; ++j) { // set the read1/2 flag
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
200 if (hits[i+j] == 0) continue;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
201 for (k = 0; k < hits[i+j]->n; ++k) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
202 bsw2hit_t *p = &hits[i+j]->hits[k];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
203 p->flag |= 1<<(6+j);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
204 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
205 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
206 if (pes.failed) continue;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
207 if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
208 if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
209 if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
210 if (!opt->skip_sw) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
211 if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
212 if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
213 } // else a[0].G == a[1].G == a[0].G2 == a[1].G2 == 0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
214 // the following enumerate all possibilities. It is tedious but necessary...
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
215 if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
216 bwtsw2_t *p[2];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
217 int which;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
218 if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
219 else p[0] = hits[i+1], p[1] = hits[i], which = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
220 if (a[which].G == 0) continue;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
221 a[which].flag |= BSW2_FLAG_RESCUED;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
222 if (p[1]->max == 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
223 p[1]->max = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
224 p[1]->hits = malloc(sizeof(bsw2hit_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
225 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
226 p[1]->hits[0] = a[which];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
227 p[1]->n = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
228 p[0]->hits[0].flag |= 2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
229 p[1]->hits[0].flag |= 2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
230 ++n_rescued;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
231 } else { // then both ends mapped
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
232 int is_fixed = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
233 //fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
234 for (j = 0; j < 2; ++j) { // fix wrong mappings and wrong suboptimal alignment score
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
235 bsw2hit_t *p = &hits[i+j]->hits[0];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
236 if (p->G < a[j].G) { // the orginal mapping is suboptimal
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
237 a[j].G2 = a[j].G2 > p->G? a[j].G2 : p->G; // FIXME: reset BSW2_FLAG_TANDEM?
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
238 *p = a[j];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
239 ++n_fixed;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
240 is_fixed = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
241 } else if (p->k != a[j].k && p->G2 < a[j].G) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
242 p->G2 = a[j].G;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
243 } else if (p->k == a[j].k && p->G2 < a[j].G2) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
244 p->G2 = a[j].G2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
245 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
246 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
247 if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
248 for (j = 0; j < 2; ++j)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
249 hits[i+j]->hits[0].flag |= 2 | (a[j].flag & BSW2_FLAG_TANDEM);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
250 } else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
251 for (j = 0; j < 2; ++j) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
252 hits[i+j]->hits[0].flag |= 2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
253 if (hits[i+j]->hits[0].k != a[j].k)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
254 hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
255 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
256 } else if (!is_fixed && (a[0].G || a[1].G)) { // it is possible to move one end
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
257 if (a[0].G && a[1].G) { // now we have two "proper pairs"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
258 int G[2];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
259 double diff;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
260 G[0] = hits[i]->hits[0].G + a[1].G;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
261 G[1] = hits[i+1]->hits[0].G + a[0].G;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
262 diff = fabs(G[0] - G[1]) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
263 if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
264 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
265 if (a[0].G == 0 || a[1].G == 0) { // one proper pair only
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
266 bsw2hit_t *p[2]; // p[0] points the unchanged hit; p[1] to the hit to be moved
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
267 int which, isize;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
268 double dev, diff;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
269 if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
270 else p[0] = &hits[i]->hits[0], p[1] = &hits[i+1]->hits[0], which = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
271 isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
272 dev = fabs(isize - pes.avg) / pes.std;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
273 diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
274 if (diff < dev * 2.) { // then move (heuristic)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
275 a[which].G2 = a[which].G;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
276 p[1][0] = a[which];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
277 p[1]->flag |= BSW2_FLAG_MOVED | 2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
278 p[0]->flag |= 2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
279 ++n_moved;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
280 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
281 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
282 } else if (is_fixed) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
283 hits[i+0]->hits[0].flag |= 2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
284 hits[i+1]->hits[0].flag |= 2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
285 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
286 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
287 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
288 ksprintf(&msg, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
289 fputs(msg.s, stderr);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
290 free(msg.s);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
291 }