annotate pyPRADA_1.2/tools/bwa-0.5.7-mh/cs2nt.c @ 3:f17965495ec9 draft default tip

Uploaded
author siyuan
date Tue, 11 Mar 2014 12:14:01 -0400
parents acc2ca1a3ba4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #include <string.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2 #include <stdint.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 #include <stdlib.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4 #include "bwtaln.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 #include "stdaln.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 /*
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 decode as ATTGAC(RBGOG), there are one color change and one nt change;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 if we decode as ATTAAC(RBRBG), there are two color changes.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 as the penalty; otherwise, we will use color quality as the
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 penalty. This means we always prefer two consistent color changes over
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 a nt change, but if a color has high quality, we may prefer one nt
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 change.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 In the above example, the penalties of the two types of decoding are
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 otherwise the second. Note that no matter what we choose, the fourth
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 base will get a low nt quality.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 #define COLOR_MM 19
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25 #define NUCL_MM 25
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27 static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 };
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 /*
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 {A,C,G,T,N} -> {0,1,2,3,4}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31 nt_ref[0..size]: nucleotide reference: 0/1/2/3/4
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 btarray[0..4*size]: backtrack array (working space)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35 */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38 int h[8], curr, last;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 int x, y, xmin, hmin, k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 // h[0..3] and h[4..7] are the current and last best score array, depending on curr and last
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 // recursion: initial value
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44 if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46 for (x = 0; x != 4; ++x) h[x] = NUCL_MM;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47 h[nt_ref[0]] = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49 // recursion: main loop
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50 curr = 1; last = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 for (k = 1; k <= size; ++k) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52 for (x = 0; x != 4; ++x) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 int min = 0x7fffffff, ymin = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54 for (y = 0; y != 4; ++y) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 int s = h[last<<2|y];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56 if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<<x|1<<y])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57 s += ((cs_read[k-1]&0x3f) < COLOR_MM)? COLOR_MM : (cs_read[k-1]&0x3f); // color mismatch
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 if (nt_ref[k] < 4 && nt_ref[k] != x) s += NUCL_MM; // nt mismatch
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 if (s < min) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60 min = s; ymin = y;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63 h[curr<<2|x] = min; btarray[k<<2|x] = ymin;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 last = curr; curr = 1 - curr; // swap
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 // back trace
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 hmin = 0x7fffffff; xmin = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 for (x = 0; x != 4; ++x) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 if (h[last<<2|x] < hmin) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 hmin = h[last<<2|x]; xmin = x;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 nt_read[size] = xmin;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 for (k = size - 1; k >= 0; --k)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 /*
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 nt_read[0..size]: nucleotide read sequence: 0/1/2/3
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 tarray[0..size*2-1]: temporary array
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 int k, c1, c2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 uint8_t *t2array = tarray + size;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 // get the color sequence of nt_read
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 c1 = nt_read[0];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 for (k = 1; k <= size; ++k) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<<c1 | 1<<c2];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 c1 = c2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 for (k = 1; k != size; ++k) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 int q = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 if (tarray[k-1] == cs_read[k-1]>>6 && tarray[k] == cs_read[k]>>6) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 } else if (tarray[k-1] == cs_read[k-1]>>6) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 } else if (tarray[k] == cs_read[k]>>6) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 } // else, q = 0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103 if (q < 0) q = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 if (q > 60) q = 60;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 t2array[k] = nt_read[k]<<6 | q;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108 return t2array + 1; // of size-2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111 // this function will be called when p->seq has been reversed by refine_gapped()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115 int i, len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 uint8_t *seq;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 // set temporary arrays
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119 if (p->type == BWA_TYPE_NO_MATCH) return;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121 ta = (uint8_t*)malloc(len * 7);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 nt_ref = ta;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123 cs_read = nt_ref + len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 nt_read = cs_read + len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125 btarray = nt_read + len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126 tarray = nt_read + len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 #define __gen_csbase(_cs, _i, _seq) do { \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130 if (q > 60) q = 60; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 if (_seq[_i] > 3) q = 63; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132 (_cs) = _seq[_i]<<6 | q; \
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133 } while (0)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
134
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
135 // generate len, nt_ref[] and cs_read
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
136 seq = p->strand? p->rseq : p->seq;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
137 nt_ref[0] = p->pos? bns_pac(pac, p->pos-1) : 4;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
138 if (p->cigar == 0) { // no gap or clipping
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
139 len = p->len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
140 for (i = 0; i < p->len; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
141 __gen_csbase(cs_read[i], i, seq);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
142 nt_ref[i+1] = bns_pac(pac, p->pos + i);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145 int k, z;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146 bwtint_t x, y;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
147 x = p->pos; y = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
148 for (k = z = 0; k < p->n_cigar; ++k) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
149 int l = __cigar_len(p->cigar[k]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
150 if (__cigar_op(p->cigar[k]) == FROM_M) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
151 for (i = 0; i < l; ++i, ++x, ++y) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
152 __gen_csbase(cs_read[z], y, seq);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
153 nt_ref[z+1] = bns_pac(pac, x);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
154 ++z;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
155 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
156 } else if (__cigar_op(p->cigar[k]) == FROM_I) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
157 for (i = 0; i < l; ++i, ++y) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
158 __gen_csbase(cs_read[z], y, seq);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159 nt_ref[z+1] = 4;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
160 ++z;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
161 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
162 } else if (__cigar_op(p->cigar[k]) == FROM_S) y += l;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
163 else x += l;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
164 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
165 len = z;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
166 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
167
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
168 cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
169 new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
170
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
171 // update p
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
172 p->len = p->full_len = len - 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
173 for (i = 0; i < p->len; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
174 if ((new_nt_read[i]&0x3f) == 63) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
175 p->qual[i] = 33; seq[i] = 4;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
176 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
177 p->qual[i] = (new_nt_read[i]&0x3f) + 33;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
178 seq[i] = new_nt_read[i]>>6;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
179 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
180 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
181 p->qual[p->len] = seq[p->len] = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
182 if (p->strand) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
183 memcpy(p->seq, seq, p->len);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
184 seq_reverse(p->len, p->seq, 1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
185 seq_reverse(p->len, p->qual, 0);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
186 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
187 memcpy(p->rseq, seq, p->len);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
188 seq_reverse(p->len, p->rseq, 1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
189 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
190 free(ta);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
191 }