comparison bwa-0.6.2/stdaln.c @ 0:dd1186b11b3b draft

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dd1186b11b3b
1 /* The MIT License
2
3 Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <stdint.h>
30 #include "stdaln.h"
31
32 /* char -> 17 (=16+1) nucleotides */
33 unsigned char aln_nt16_table[256] = {
34 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
35 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
36 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,16 /*'-'*/,15,15,
37 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
38 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15,
39 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15,
40 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15,
41 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15,
42 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
43 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
44 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
45 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
46 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
47 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
48 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
49 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
50 };
51 char *aln_nt16_rev_table = "XAGRCMSVTWKDYHBN-";
52
53 /* char -> 5 (=4+1) nucleotides */
54 unsigned char aln_nt4_table[256] = {
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
58 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4,
60 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
61 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4,
62 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
63 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
64 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
65 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
66 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
67 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
68 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
69 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
70 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
71 };
72 char *aln_nt4_rev_table = "AGCTN-";
73
74 /* char -> 22 (=20+1+1) amino acids */
75 unsigned char aln_aa_table[256] = {
76 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
77 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
78 21,21,21,21, 21,21,21,21, 21,21,20,21, 21,22 /*'-'*/,21,21,
79 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
80 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21,
81 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21,
82 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21,
83 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21,
84 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
85 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
86 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
87 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
88 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
89 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
90 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21,
91 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21
92 };
93 char *aln_aa_rev_table = "ARNDCQEGHILKMFPSTWYV*X-";
94 /* 01234567890123456789012 */
95
96 /* translation table. They are useless in stdaln.c, but when you realize you need it, you need not write the table again. */
97 unsigned char aln_trans_table_eu[66] = {
98 11,11, 2, 2, 1, 1,15,15, 16,16,16,16, 9,12, 9, 9,
99 6, 6, 3, 3, 7, 7, 7, 7, 0, 0, 0, 0, 19,19,19,19,
100 5, 5, 8, 8, 1, 1, 1, 1, 14,14,14,14, 10,10,10,10,
101 20,20,18,18, 20,17, 4, 4, 15,15,15,15, 10,10,13,13, 21, 22
102 };
103 char *aln_trans_table_eu_char = "KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFFX";
104 /* 01234567890123456789012345678901234567890123456789012345678901234 */
105 int aln_sm_blosum62[] = {
106 /* A R N D C Q E G H I L K M F P S T W Y V * X */
107 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
108 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
109 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
110 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
111 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
112 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
113 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
114 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
115 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
116 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
117 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
118 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
119 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
120 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
121 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
122 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
123 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
124 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
125 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
126 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
127 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
128 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
129 };
130
131 int aln_sm_blosum45[] = {
132 /* A R N D C Q E G H I L K M F P S T W Y V * X */
133 5,-2,-1,-2,-1,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-2,-2, 0,-5, 0,
134 -2, 7, 0,-1,-3, 1, 0,-2, 0,-3,-2, 3,-1,-2,-2,-1,-1,-2,-1,-2,-5,-1,
135 -1, 0, 6, 2,-2, 0, 0, 0, 1,-2,-3, 0,-2,-2,-2, 1, 0,-4,-2,-3,-5,-1,
136 -2,-1, 2, 7,-3, 0, 2,-1, 0,-4,-3, 0,-3,-4,-1, 0,-1,-4,-2,-3,-5,-1,
137 -1,-3,-2,-3,12,-3,-3,-3,-3,-3,-2,-3,-2,-2,-4,-1,-1,-5,-3,-1,-5,-2,
138 -1, 1, 0, 0,-3, 6, 2,-2, 1,-2,-2, 1, 0,-4,-1, 0,-1,-2,-1,-3,-5,-1,
139 -1, 0, 0, 2,-3, 2, 6,-2, 0,-3,-2, 1,-2,-3, 0, 0,-1,-3,-2,-3,-5,-1,
140 0,-2, 0,-1,-3,-2,-2, 7,-2,-4,-3,-2,-2,-3,-2, 0,-2,-2,-3,-3,-5,-1,
141 -2, 0, 1, 0,-3, 1, 0,-2,10,-3,-2,-1, 0,-2,-2,-1,-2,-3, 2,-3,-5,-1,
142 -1,-3,-2,-4,-3,-2,-3,-4,-3, 5, 2,-3, 2, 0,-2,-2,-1,-2, 0, 3,-5,-1,
143 -1,-2,-3,-3,-2,-2,-2,-3,-2, 2, 5,-3, 2, 1,-3,-3,-1,-2, 0, 1,-5,-1,
144 -1, 3, 0, 0,-3, 1, 1,-2,-1,-3,-3, 5,-1,-3,-1,-1,-1,-2,-1,-2,-5,-1,
145 -1,-1,-2,-3,-2, 0,-2,-2, 0, 2, 2,-1, 6, 0,-2,-2,-1,-2, 0, 1,-5,-1,
146 -2,-2,-2,-4,-2,-4,-3,-3,-2, 0, 1,-3, 0, 8,-3,-2,-1, 1, 3, 0,-5,-1,
147 -1,-2,-2,-1,-4,-1, 0,-2,-2,-2,-3,-1,-2,-3, 9,-1,-1,-3,-3,-3,-5,-1,
148 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-3,-1,-2,-2,-1, 4, 2,-4,-2,-1,-5, 0,
149 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-1,-1, 2, 5,-3,-1, 0,-5, 0,
150 -2,-2,-4,-4,-5,-2,-3,-2,-3,-2,-2,-2,-2, 1,-3,-4,-3,15, 3,-3,-5,-2,
151 -2,-1,-2,-2,-3,-1,-2,-3, 2, 0, 0,-1, 0, 3,-3,-2,-1, 3, 8,-1,-5,-1,
152 0,-2,-3,-3,-1,-3,-3,-3,-3, 3, 1,-2, 1, 0,-3,-1, 0,-3,-1, 5,-5,-1,
153 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 1,-5,
154 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0,-2,-1,-1,-5,-1
155 };
156
157 int aln_sm_nt[] = {
158 /* X A G R C M S V T W K D Y H B N */
159 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
160 -2, 2,-1, 1,-2, 1,-2, 0,-2, 1,-2, 0,-2, 0,-2, 0,
161 -2,-1, 2, 1,-2,-2, 1, 0,-2,-2, 1, 0,-2,-2, 0, 0,
162 -2, 1, 1, 1,-2,-1,-1, 0,-2,-1,-1, 0,-2, 0, 0, 0,
163 -2,-2,-2,-2, 2, 1, 1, 0,-1,-2,-2,-2, 1, 0, 0, 0,
164 -2, 1,-2,-1, 1, 1,-1, 0,-2,-1,-2, 0,-1, 0, 0, 0,
165 -2,-2, 1,-1, 1,-1, 1, 0,-2,-2,-1, 0,-1, 0, 0, 0,
166 -2, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0,
167 -2,-2,-2,-2,-1,-2,-2,-2, 2, 1, 1, 0, 1, 0, 0, 0,
168 -2, 1,-2,-1,-2,-1,-2, 0, 1, 1,-1, 0,-1, 0, 0, 0,
169 -2,-2, 1,-1,-2,-2,-1, 0, 1,-1, 1, 0,-1, 0, 0, 0,
170 -2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171 -2,-2,-2,-2, 1,-1,-1, 0, 1,-1,-1, 0, 1, 0, 0, 0,
172 -2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
173 -2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
174 -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
175 };
176
177 int aln_sm_read[] = {
178 /* X A G R C M S V T W K D Y H B N */
179 -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,
180 -17, 2,-17, 1,-17, 1,-17, 0,-17, 1,-17, 0,-17, 0,-17, 0,
181 -17,-17, 2, 1,-17,-17, 1, 0,-17,-17, 1, 0,-17,-17, 0, 0,
182 -17, 1, 1, 1,-17,-17,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0,
183 -17,-17,-17,-17, 2, 1, 1, 0,-17,-17,-17,-17, 1, 0, 0, 0,
184 -17, 1,-17,-17, 1, 1,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0,
185 -17,-17, 1,-17, 1,-17, 1, 0,-17,-17,-17, 0,-17, 0, 0, 0,
186 -17, 0, 0, 0, 0, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0,
187 -17,-17,-17,-17,-17,-17,-17,-17, 2, 1, 1, 0, 1, 0, 0, 0,
188 -17, 1,-17,-17,-17,-17,-17, 0, 1, 1,-17, 0,-17, 0, 0, 0,
189 -17,-17, 1,-17,-17,-17,-17, 0, 1,-17, 1, 0,-17, 0, 0, 0,
190 -17, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191 -17,-17,-17,-17, 1,-17,-17, 0, 1,-17,-17, 0, 1, 0, 0, 0,
192 -17, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
193 -17,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 -17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
195 };
196
197 int aln_sm_hs[] = {
198 /* A G C T N */
199 91, -31,-114,-123, -44,
200 -31, 100,-125,-114, -42,
201 -123,-125, 100, -31, -42,
202 -114,-114, -31, 91, -42,
203 -44, -42, -42, -42, -43
204 };
205
206 int aln_sm_maq[] = {
207 11, -19, -19, -19, -13,
208 -19, 11, -19, -19, -13,
209 -19, -19, 11, -19, -13,
210 -19, -19, -19, 11, -13,
211 -13, -13, -13, -13, -13
212 };
213
214 int aln_sm_blast[] = {
215 1, -3, -3, -3, -2,
216 -3, 1, -3, -3, -2,
217 -3, -3, 1, -3, -2,
218 -3, -3, -3, 1, -2,
219 -2, -2, -2, -2, -2
220 };
221
222 /********************/
223 /* START OF align.c */
224 /********************/
225
226 AlnParam aln_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 };
227 AlnParam aln_param_bwa = { 26, 9, 5, aln_sm_maq, 5, 50 };
228 AlnParam aln_param_nt2nt = { 8, 2, 2, aln_sm_nt, 16, 75 };
229 AlnParam aln_param_rd2rd = { 1, 19, 19, aln_sm_read, 16, 75 };
230 AlnParam aln_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 };
231
232 AlnAln *aln_init_AlnAln()
233 {
234 AlnAln *aa;
235 aa = (AlnAln*)malloc(sizeof(AlnAln));
236 aa->path = 0;
237 aa->out1 = aa->out2 = aa->outm = 0;
238 aa->path_len = 0;
239 return aa;
240 }
241 void aln_free_AlnAln(AlnAln *aa)
242 {
243 free(aa->path); free(aa->cigar32);
244 free(aa->out1); free(aa->out2); free(aa->outm);
245 free(aa);
246 }
247
248 /***************************/
249 /* START OF common_align.c */
250 /***************************/
251
252 #define LOCAL_OVERFLOW_THRESHOLD 32000
253 #define LOCAL_OVERFLOW_REDUCE 16000
254 #define NT_LOCAL_SCORE int
255 #define NT_LOCAL_SHIFT 16
256 #define NT_LOCAL_MASK 0xffff
257
258 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
259
260 #define set_M(MM, cur, p, sc) \
261 { \
262 if ((p)->M >= (p)->I) { \
263 if ((p)->M >= (p)->D) { \
264 (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
265 } else { \
266 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
267 } \
268 } else { \
269 if ((p)->I > (p)->D) { \
270 (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
271 } else { \
272 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
273 } \
274 } \
275 }
276 #define set_I(II, cur, p) \
277 { \
278 if ((p)->M - gap_open > (p)->I) { \
279 (cur)->It = FROM_M; \
280 (II) = (p)->M - gap_open - gap_ext; \
281 } else { \
282 (cur)->It = FROM_I; \
283 (II) = (p)->I - gap_ext; \
284 } \
285 }
286 #define set_end_I(II, cur, p) \
287 { \
288 if (gap_end >= 0) { \
289 if ((p)->M - gap_open > (p)->I) { \
290 (cur)->It = FROM_M; \
291 (II) = (p)->M - gap_open - gap_end; \
292 } else { \
293 (cur)->It = FROM_I; \
294 (II) = (p)->I - gap_end; \
295 } \
296 } else set_I(II, cur, p); \
297 }
298 #define set_D(DD, cur, p) \
299 { \
300 if ((p)->M - gap_open > (p)->D) { \
301 (cur)->Dt = FROM_M; \
302 (DD) = (p)->M - gap_open - gap_ext; \
303 } else { \
304 (cur)->Dt = FROM_D; \
305 (DD) = (p)->D - gap_ext; \
306 } \
307 }
308 #define set_end_D(DD, cur, p) \
309 { \
310 if (gap_end >= 0) { \
311 if ((p)->M - gap_open > (p)->D) { \
312 (cur)->Dt = FROM_M; \
313 (DD) = (p)->M - gap_open - gap_end; \
314 } else { \
315 (cur)->Dt = FROM_D; \
316 (DD) = (p)->D - gap_end; \
317 } \
318 } else set_D(DD, cur, p); \
319 }
320
321 typedef struct
322 {
323 unsigned char Mt:3, It:2, Dt:2;
324 } dpcell_t;
325
326 typedef struct
327 {
328 int M, I, D;
329 } dpscore_t;
330
331 /* build score profile for accelerating alignment, in theory */
332 void aln_init_score_array(unsigned char *seq, int len, int row, int *score_matrix, int **s_array)
333 {
334 int *tmp, *tmp2, i, k;
335 for (i = 0; i != row; ++i) {
336 tmp = score_matrix + i * row;
337 tmp2 = s_array[i];
338 for (k = 0; k != len; ++k)
339 tmp2[k] = tmp[seq[k]];
340 }
341 }
342 /***************************
343 * banded global alignment *
344 ***************************/
345 int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
346 path_t *path, int *path_len)
347 {
348 register int i, j;
349 dpcell_t **dpcell, *q;
350 dpscore_t *curr, *last, *s;
351 path_t *p;
352 int b1, b2, tmp_end;
353 int *mat, end, max;
354 unsigned char type, ctype;
355
356 int gap_open, gap_ext, gap_end, b;
357 int *score_matrix, N_MATRIX_ROW;
358
359 /* initialize some align-related parameters. just for compatibility */
360 gap_open = ap->gap_open;
361 gap_ext = ap->gap_ext;
362 gap_end = ap->gap_end;
363 b = ap->band_width;
364 score_matrix = ap->matrix;
365 N_MATRIX_ROW = ap->row;
366
367 if (len1 == 0 || len2 == 0) {
368 *path_len = 0;
369 return 0;
370 }
371 /* calculate b1 and b2 */
372 if (len1 > len2) {
373 b1 = len1 - len2 + b;
374 b2 = b;
375 } else {
376 b1 = b;
377 b2 = len2 - len1 + b;
378 }
379 if (b1 > len1) b1 = len1;
380 if (b2 > len2) b2 = len2;
381 --seq1; --seq2;
382
383 /* allocate memory */
384 end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
385 dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
386 for (j = 0; j <= len2; ++j)
387 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
388 for (j = b2 + 1; j <= len2; ++j)
389 dpcell[j] -= j - b2;
390 curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
391 last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
392
393 /* set first row */
394 SET_INF(*curr); curr->M = 0;
395 for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
396 SET_INF(*s);
397 set_end_D(s->D, dpcell[0] + i, s - 1);
398 }
399 s = curr; curr = last; last = s;
400
401 /* core dynamic programming, part 1 */
402 tmp_end = (b2 < len2)? b2 : len2 - 1;
403 for (j = 1; j <= tmp_end; ++j) {
404 q = dpcell[j]; s = curr; SET_INF(*s);
405 set_end_I(s->I, q, last);
406 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
407 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
408 ++s; ++q;
409 for (i = 1; i != end; ++i, ++s, ++q) {
410 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
411 set_I(s->I, q, last + i);
412 set_D(s->D, q, s - 1);
413 }
414 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
415 set_D(s->D, q, s - 1);
416 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
417 set_end_I(s->I, q, last + i);
418 } else s->I = MINOR_INF;
419 s = curr; curr = last; last = s;
420 }
421 /* last row for part 1, use set_end_D() instead of set_D() */
422 if (j == len2 && b2 != len2 - 1) {
423 q = dpcell[j]; s = curr; SET_INF(*s);
424 set_end_I(s->I, q, last);
425 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
426 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
427 ++s; ++q;
428 for (i = 1; i != end; ++i, ++s, ++q) {
429 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
430 set_I(s->I, q, last + i);
431 set_end_D(s->D, q, s - 1);
432 }
433 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
434 set_end_D(s->D, q, s - 1);
435 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
436 set_end_I(s->I, q, last + i);
437 } else s->I = MINOR_INF;
438 s = curr; curr = last; last = s;
439 ++j;
440 }
441
442 /* core dynamic programming, part 2 */
443 for (; j <= len2 - b2 + 1; ++j) {
444 SET_INF(curr[j - b2]);
445 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
446 end = j + b1 - 1;
447 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
448 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
449 set_I(s->I, q, last + i);
450 set_D(s->D, q, s - 1);
451 }
452 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
453 set_D(s->D, q, s - 1);
454 s->I = MINOR_INF;
455 s = curr; curr = last; last = s;
456 }
457
458 /* core dynamic programming, part 3 */
459 for (; j < len2; ++j) {
460 SET_INF(curr[j - b2]);
461 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
462 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
463 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
464 set_I(s->I, q, last + i);
465 set_D(s->D, q, s - 1);
466 }
467 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
468 set_end_I(s->I, q, last + i);
469 set_D(s->D, q, s - 1);
470 s = curr; curr = last; last = s;
471 }
472 /* last row */
473 if (j == len2) {
474 SET_INF(curr[j - b2]);
475 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
476 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
477 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
478 set_I(s->I, q, last + i);
479 set_end_D(s->D, q, s - 1);
480 }
481 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
482 set_end_I(s->I, q, last + i);
483 set_end_D(s->D, q, s - 1);
484 s = curr; curr = last; last = s;
485 }
486
487 /* backtrace */
488 i = len1; j = len2;
489 q = dpcell[j] + i;
490 s = last + len1;
491 max = s->M; type = q->Mt; ctype = FROM_M;
492 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
493 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
494
495 p = path;
496 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
497 ++p;
498 do {
499 switch (ctype) {
500 case FROM_M: --i; --j; break;
501 case FROM_I: --j; break;
502 case FROM_D: --i; break;
503 }
504 q = dpcell[j] + i;
505 ctype = type;
506 switch (type) {
507 case FROM_M: type = q->Mt; break;
508 case FROM_I: type = q->It; break;
509 case FROM_D: type = q->Dt; break;
510 }
511 p->ctype = ctype; p->i = i; p->j = j;
512 ++p;
513 } while (i || j);
514 *path_len = p - path - 1;
515
516 /* free memory */
517 for (j = b2 + 1; j <= len2; ++j)
518 dpcell[j] += j - b2;
519 for (j = 0; j <= len2; ++j)
520 free(dpcell[j]);
521 free(dpcell);
522 free(curr); free(last);
523
524 return max;
525 }
526 /*************************************************
527 * local alignment combined with banded strategy *
528 *************************************************/
529 int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
530 path_t *path, int *path_len, int _thres, int *_subo)
531 {
532 register NT_LOCAL_SCORE *s;
533 register int i;
534 int q, r, qr, tmp_len, qr_shift;
535 int **s_array, *score_array;
536 int e, f;
537 int is_overflow, of_base;
538 NT_LOCAL_SCORE *eh, curr_h, last_h, curr_last_h;
539 int j, start_i, start_j, end_i, end_j;
540 path_t *p;
541 int score_f, score_r, score_g;
542 int start, end, max_score;
543 int thres, *suba, *ss;
544
545 int gap_open, gap_ext, b;
546 int *score_matrix, N_MATRIX_ROW;
547
548 /* initialize some align-related parameters. just for compatibility */
549 gap_open = ap->gap_open;
550 gap_ext = ap->gap_ext;
551 b = ap->band_width;
552 score_matrix = ap->matrix;
553 N_MATRIX_ROW = ap->row;
554 thres = _thres > 0? _thres : -_thres;
555
556 if (len1 == 0 || len2 == 0) return -1;
557
558 /* allocate memory */
559 suba = (int*)malloc(sizeof(int) * (len2 + 1));
560 eh = (NT_LOCAL_SCORE*)malloc(sizeof(NT_LOCAL_SCORE) * (len1 + 1));
561 s_array = (int**)malloc(sizeof(int*) * N_MATRIX_ROW);
562 for (i = 0; i != N_MATRIX_ROW; ++i)
563 s_array[i] = (int*)malloc(sizeof(int) * len1);
564 /* initialization */
565 aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array);
566 q = gap_open;
567 r = gap_ext;
568 qr = q + r;
569 qr_shift = (qr+1) << NT_LOCAL_SHIFT;
570 tmp_len = len1 + 1;
571 start_i = start_j = end_i = end_j = 0;
572 for (i = 0, max_score = 0; i != N_MATRIX_ROW * N_MATRIX_ROW; ++i)
573 if (max_score < score_matrix[i]) max_score = score_matrix[i];
574 /* convert the coordinate */
575 --seq1; --seq2;
576 for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i];
577
578 /* forward dynamic programming */
579 for (i = 0, s = eh; i != tmp_len; ++i, ++s) *s = 0;
580 score_f = 0;
581 is_overflow = of_base = 0;
582 suba[0] = 0;
583 for (j = 1, ss = suba + 1; j <= len2; ++j, ++ss) {
584 int subo = 0;
585 last_h = f = 0;
586 score_array = s_array[seq2[j]];
587 if (is_overflow) { /* adjust eh[] array if overflow occurs. */
588 /* If LOCAL_OVERFLOW_REDUCE is too small, optimal alignment might be missed.
589 * If it is too large, this block will be excuted frequently and therefore
590 * slow down the whole program.
591 * Acually, smaller LOCAL_OVERFLOW_REDUCE might also help to reduce the
592 * number of assignments because it sets some cells to zero when overflow
593 * happens. */
594 int tmp, tmp2;
595 score_f -= LOCAL_OVERFLOW_REDUCE;
596 of_base += LOCAL_OVERFLOW_REDUCE;
597 is_overflow = 0;
598 for (i = 1, s = eh; i <= tmp_len; ++i, ++s) {
599 tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK;
600 if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0;
601 else tmp2 -= LOCAL_OVERFLOW_REDUCE;
602 if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0;
603 else tmp -= LOCAL_OVERFLOW_REDUCE;
604 *s = (tmp << NT_LOCAL_SHIFT) | tmp2;
605 }
606 }
607 for (i = 1, s = eh; i != tmp_len; ++i, ++s) {
608 /* prepare for calculate current h */
609 curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i];
610 if (curr_h < 0) curr_h = 0;
611 if (last_h > 0) { /* initialize f */
612 f = (f > last_h - q)? f - r : last_h - qr;
613 if (curr_h < f) curr_h = f;
614 }
615 if (*(s+1) >= qr_shift) { /* initialize e */
616 curr_last_h = *(s+1) >> NT_LOCAL_SHIFT;
617 e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr;
618 if (curr_h < e) curr_h = e;
619 *s = (last_h << NT_LOCAL_SHIFT) | e;
620 } else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */
621 last_h = curr_h;
622 if (subo < curr_h) subo = curr_h;
623 if (score_f < curr_h) {
624 score_f = curr_h; end_i = i; end_j = j;
625 if (score_f > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1;
626 }
627 }
628 *s = last_h << NT_LOCAL_SHIFT;
629 *ss = subo + of_base;
630 }
631 score_f += of_base;
632
633 if (score_f < thres) { /* no matching residue at all, 090218 */
634 if (path_len) *path_len = 0;
635 goto end_func;
636 }
637 if (path == 0) goto end_func; /* skip path-filling */
638
639 /* reverse dynamic programming */
640 for (i = end_i, s = eh + end_i; i >= 0; --i, --s) *s = 0;
641 if (end_i == 0 || end_j == 0) goto end_func; /* no local match */
642 score_r = score_matrix[seq1[end_i] * N_MATRIX_ROW + seq2[end_j]];
643 is_overflow = of_base = 0;
644 start_i = end_i; start_j = end_j;
645 eh[end_i] = ((NT_LOCAL_SCORE)(qr + score_r)) << NT_LOCAL_SHIFT; /* in order to initialize f and e, 040408 */
646 start = end_i - 1;
647 end = end_i - 3;
648 if (end <= 0) end = 0;
649
650 /* second pass DP can be done in a band, speed will thus be enhanced */
651 for (j = end_j - 1; j != 0; --j) {
652 last_h = f = 0;
653 score_array = s_array[seq2[j]];
654 if (is_overflow) { /* adjust eh[] array if overflow occurs. */
655 int tmp, tmp2;
656 score_r -= LOCAL_OVERFLOW_REDUCE;
657 of_base += LOCAL_OVERFLOW_REDUCE;
658 is_overflow = 0;
659 for (i = start, s = eh + start + 1; i >= end; --i, --s) {
660 tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK;
661 if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0;
662 else tmp2 -= LOCAL_OVERFLOW_REDUCE;
663 if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0;
664 else tmp -= LOCAL_OVERFLOW_REDUCE;
665 *s = (tmp << NT_LOCAL_SHIFT) | tmp2;
666 }
667 }
668 for (i = start, s = eh + start + 1; i != end; --i, --s) {
669 /* prepare for calculate current h */
670 curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i];
671 if (curr_h < 0) curr_h = 0;
672 if (last_h > 0) { /* initialize f */
673 f = (f > last_h - q)? f - r : last_h - qr;
674 if (curr_h < f) curr_h = f;
675 }
676 curr_last_h = *(s-1) >> NT_LOCAL_SHIFT;
677 e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr;
678 if (e < 0) e = 0;
679 if (curr_h < e) curr_h = e;
680 *s = (last_h << NT_LOCAL_SHIFT) | e;
681 last_h = curr_h;
682 if (score_r < curr_h) {
683 score_r = curr_h; start_i = i; start_j = j;
684 if (score_r + of_base - qr == score_f) {
685 j = 1; break;
686 }
687 if (score_r > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1;
688 }
689 }
690 *s = last_h << NT_LOCAL_SHIFT;
691 /* recalculate start and end, the boundaries of the band */
692 if ((eh[start] >> NT_LOCAL_SHIFT) <= qr) --start;
693 if (start <= 0) start = 0;
694 end = start_i - (start_j - j) - (score_r + of_base + (start_j - j) * max_score) / r - 1;
695 if (end <= 0) end = 0;
696 }
697
698 if (_subo) {
699 int tmp2 = 0, tmp = (int)(start_j - .33 * (end_j - start_j) + .499);
700 for (j = 1; j <= tmp; ++j)
701 if (tmp2 < suba[j]) tmp2 = suba[j];
702 tmp = (int)(end_j + .33 * (end_j - start_j) + .499);
703 for (j = tmp; j <= len2; ++j)
704 if (tmp2 < suba[j]) tmp2 = suba[j];
705 *_subo = tmp2;
706 }
707
708 if (path_len == 0) {
709 path[0].i = start_i; path[0].j = start_j;
710 path[1].i = end_i; path[1].j = end_j;
711 goto end_func;
712 }
713
714 score_r += of_base;
715 score_r -= qr;
716
717 #ifdef DEBUG
718 /* this seems not a bug */
719 if (score_f != score_r)
720 fprintf(stderr, "[aln_local_core] unknown flaw occurs: score_f(%d) != score_r(%d)\n", score_f, score_r);
721 #endif
722
723 if (_thres > 0) { /* call global alignment to fill the path */
724 score_g = 0;
725 j = (end_i - start_i > end_j - start_j)? end_i - start_i : end_j - start_j;
726 ++j; /* j is the maximum band_width */
727 for (i = ap->band_width;; i <<= 1) {
728 AlnParam ap_real = *ap;
729 ap_real.gap_end = -1;
730 ap_real.band_width = i;
731 score_g = aln_global_core(seq1 + start_i, end_i - start_i + 1, seq2 + start_j,
732 end_j - start_j + 1, &ap_real, path, path_len);
733 if (score_g == score_r || score_f == score_g) break;
734 if (i > j) break;
735 }
736 if (score_r > score_g && score_f > score_g) {
737 fprintf(stderr, "[aln_local_core] Potential bug: (%d,%d) > %d\n", score_f, score_r, score_g);
738 score_f = score_r = -1;
739 } else score_f = score_g;
740
741 /* convert coordinate */
742 for (p = path + *path_len - 1; p >= path; --p) {
743 p->i += start_i - 1;
744 p->j += start_j - 1;
745 }
746 } else { /* just store the start and end */
747 *path_len = 2;
748 path[1].i = start_i; path[1].j = start_j;
749 path->i = end_i; path->j = end_j;
750 }
751
752 end_func:
753 /* free */
754 free(eh); free(suba);
755 for (i = 0; i != N_MATRIX_ROW; ++i) {
756 ++s_array[i];
757 free(s_array[i]);
758 }
759 free(s_array);
760 return score_f;
761 }
762 AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap,
763 int type, int thres, int len1, int len2)
764 {
765 unsigned char *seq11, *seq22;
766 int score;
767 int i, j, l;
768 path_t *p;
769 char *out1, *out2, *outm;
770 AlnAln *aa;
771
772 if (len1 < 0) len1 = strlen(seq1);
773 if (len2 < 0) len2 = strlen(seq2);
774
775 aa = aln_init_AlnAln();
776 seq11 = (unsigned char*)malloc(sizeof(unsigned char) * len1);
777 seq22 = (unsigned char*)malloc(sizeof(unsigned char) * len2);
778 aa->path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 1));
779
780 if (ap->row < 10) { /* 4-nucleotide alignment */
781 for (i = 0; i < len1; ++i)
782 seq11[i] = aln_nt4_table[(int)seq1[i]];
783 for (j = 0; j < len2; ++j)
784 seq22[j] = aln_nt4_table[(int)seq2[j]];
785 } else if (ap->row < 20) { /* 16-nucleotide alignment */
786 for (i = 0; i < len1; ++i)
787 seq11[i] = aln_nt16_table[(int)seq1[i]];
788 for (j = 0; j < len2; ++j)
789 seq22[j] = aln_nt16_table[(int)seq2[j]];
790 } else { /* amino acids */
791 for (i = 0; i < len1; ++i)
792 seq11[i] = aln_aa_table[(int)seq1[i]];
793 for (j = 0; j < len2; ++j)
794 seq22[j] = aln_aa_table[(int)seq2[j]];
795 }
796
797 if (type == ALN_TYPE_GLOBAL) score = aln_global_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len);
798 else if (type == ALN_TYPE_LOCAL) score = aln_local_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, thres, &aa->subo);
799 else if (type == ALN_TYPE_EXTEND) score = aln_extend_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, 1, 0);
800 else {
801 free(seq11); free(seq22); free(aa->path);
802 aln_free_AlnAln(aa);
803 return 0;
804 }
805 aa->score = score;
806
807 if (thres > 0) {
808 out1 = aa->out1 = (char*)malloc(sizeof(char) * (aa->path_len + 1));
809 out2 = aa->out2 = (char*)malloc(sizeof(char) * (aa->path_len + 1));
810 outm = aa->outm = (char*)malloc(sizeof(char) * (aa->path_len + 1));
811
812 --seq1; --seq2;
813 --seq11; --seq22;
814
815 p = aa->path + aa->path_len - 1;
816
817 for (l = 0; p >= aa->path; --p, ++l) {
818 switch (p->ctype) {
819 case FROM_M: out1[l] = seq1[p->i]; out2[l] = seq2[p->j];
820 outm[l] = (seq11[p->i] == seq22[p->j] && seq11[p->i] != ap->row)? '|' : ' ';
821 break;
822 case FROM_I: out1[l] = '-'; out2[l] = seq2[p->j]; outm[l] = ' '; break;
823 case FROM_D: out1[l] = seq1[p->i]; out2[l] = '-'; outm[l] = ' '; break;
824 }
825 }
826 out1[l] = out2[l] = outm[l] = '\0';
827 ++seq11; ++seq22;
828 }
829
830 free(seq11);
831 free(seq22);
832
833 p = aa->path + aa->path_len - 1;
834 aa->start1 = p->i? p->i : 1;
835 aa->end1 = aa->path->i;
836 aa->start2 = p->j? p->j : 1;
837 aa->end2 = aa->path->j;
838 aa->cigar32 = aln_path2cigar32(aa->path, aa->path_len, &aa->n_cigar);
839
840 return aa;
841 }
842 AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int thres)
843 {
844 return aln_stdaln_aux(seq1, seq2, ap, type, thres, -1, -1);
845 }
846
847 /* for backward compatibility */
848 uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
849 {
850 uint32_t *cigar32;
851 uint16_t *cigar;
852 int i;
853 cigar32 = aln_path2cigar32(path, path_len, n_cigar);
854 cigar = (uint16_t*)cigar32;
855 for (i = 0; i < *n_cigar; ++i)
856 cigar[i] = (cigar32[i]&0xf)<<14 | (cigar32[i]>>4&0x3fff);
857 return cigar;
858 }
859
860 /* newly added functions (2009-07-21) */
861
862 int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
863 path_t *path, int *path_len, int G0, uint8_t *_mem)
864 {
865 int q, r, qr, tmp_len;
866 int32_t **s_array, *score_array;
867 int is_overflow, of_base;
868 uint32_t *eh;
869 int i, j, end_i, end_j;
870 int score, start, end;
871 int *score_matrix, N_MATRIX_ROW;
872 uint8_t *mem, *_p;
873
874 /* initialize some align-related parameters. just for compatibility */
875 q = ap->gap_open;
876 r = ap->gap_ext;
877 qr = q + r;
878 score_matrix = ap->matrix;
879 N_MATRIX_ROW = ap->row;
880
881 if (len1 == 0 || len2 == 0) return -1;
882
883 /* allocate memory */
884 mem = _mem? _mem : calloc((len1 + 2) * (N_MATRIX_ROW + 1), 4);
885 _p = mem;
886 eh = (uint32_t*)_p, _p += 4 * (len1 + 2);
887 s_array = calloc(N_MATRIX_ROW, sizeof(void*));
888 for (i = 0; i != N_MATRIX_ROW; ++i)
889 s_array[i] = (int32_t*)_p, _p += 4 * len1;
890 /* initialization */
891 aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array);
892 tmp_len = len1 + 1;
893 start = 1; end = 2;
894 end_i = end_j = 0;
895 score = 0;
896 is_overflow = of_base = 0;
897 /* convert the coordinate */
898 --seq1; --seq2;
899 for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i];
900
901 /* dynamic programming */
902 memset(eh, 0, 4 * (len1 + 2));
903 eh[1] = (uint32_t)G0<<16;
904 for (j = 1; j <= len2; ++j) {
905 int _start, _end;
906 int h1 = 0, f = 0;
907 score_array = s_array[seq2[j]];
908 /* set start and end */
909 _start = j - ap->band_width;
910 if (_start < 1) _start = 1;
911 if (_start > start) start = _start;
912 _end = j + ap->band_width;
913 if (_end > len1 + 1) _end = len1 + 1;
914 if (_end < end) end = _end;
915 if (start == end) break;
916 /* adjust eh[] array if overflow occurs. */
917 if (is_overflow) {
918 int tmp, tmp2;
919 score -= LOCAL_OVERFLOW_REDUCE;
920 of_base += LOCAL_OVERFLOW_REDUCE;
921 is_overflow = 0;
922 for (i = start; i <= end; ++i) {
923 uint32_t *s = &eh[i];
924 tmp = *s >> 16; tmp2 = *s & 0xffff;
925 if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0;
926 else tmp2 -= LOCAL_OVERFLOW_REDUCE;
927 if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0;
928 else tmp -= LOCAL_OVERFLOW_REDUCE;
929 *s = (tmp << 16) | tmp2;
930 }
931 }
932 _start = _end = 0;
933 /* the inner loop */
934 for (i = start; i < end; ++i) {
935 /* At the beginning of each cycle:
936 eh[i] -> h[j-1,i-1]<<16 | e[j,i]
937 f -> f[j,i]
938 h1 -> h[j,i-1]
939 */
940 uint32_t *s = &eh[i];
941 int h = (int)(*s >> 16);
942 int e = *s & 0xffff; /* this is e[j,i] */
943 *s = (uint32_t)h1 << 16; /* eh[i] now stores h[j,i-1]<<16 */
944 h += h? score_array[i] : 0; /* this is left_core() specific */
945 /* calculate h[j,i]; don't need to test 0, as {e,f}>=0 */
946 h = h > e? h : e;
947 h = h > f? h : f; /* h now is h[j,i] */
948 h1 = h;
949 if (h > 0) {
950 if (_start == 0) _start = i;
951 _end = i;
952 if (score < h) {
953 score = h; end_i = i; end_j = j;
954 if (score > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1;
955 }
956 }
957 /* calculate e[j+1,i] and f[j,i+1] */
958 h -= qr;
959 h = h > 0? h : 0;
960 e -= r;
961 e = e > h? e : h;
962 f -= r;
963 f = f > h? f : h;
964 *s |= e;
965 }
966 eh[end] = h1 << 16;
967 /* recalculate start and end, the boundaries of the band */
968 if (_end <= 0) break; /* no cell in this row has a positive score */
969 start = _start;
970 end = _end + 3;
971 }
972
973 score += of_base - 1;
974 if (score <= 0) {
975 if (path_len) *path_len = 0;
976 goto end_left_func;
977 }
978
979 if (path == 0) goto end_left_func;
980
981 if (path_len == 0) {
982 path[0].i = end_i; path[0].j = end_j;
983 goto end_left_func;
984 }
985
986 { /* call global alignment to fill the path */
987 int score_g = 0;
988 j = (end_i - 1 > end_j - 1)? end_i - 1 : end_j - 1;
989 ++j; /* j is the maximum band_width */
990 for (i = ap->band_width;; i <<= 1) {
991 AlnParam ap_real = *ap;
992 ap_real.gap_end = -1;
993 ap_real.band_width = i;
994 score_g = aln_global_core(seq1 + 1, end_i, seq2 + 1, end_j, &ap_real, path, path_len);
995 if (score == score_g) break;
996 if (i > j) break;
997 }
998 if (score > score_g)
999 fprintf(stderr, "[aln_left_core] no suitable bandwidth: %d < %d\n", score_g, score);
1000 score = score_g;
1001 }
1002
1003 end_left_func:
1004 /* free */
1005 free(s_array);
1006 if (!_mem) free(mem);
1007 return score;
1008 }
1009
1010 uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar)
1011 {
1012 int i, n;
1013 uint32_t *cigar;
1014 unsigned char last_type;
1015
1016 if (path_len == 0 || path == 0) {
1017 *n_cigar = 0;
1018 return 0;
1019 }
1020
1021 last_type = path->ctype;
1022 for (i = n = 1; i < path_len; ++i) {
1023 if (last_type != path[i].ctype) ++n;
1024 last_type = path[i].ctype;
1025 }
1026 *n_cigar = n;
1027 cigar = (uint32_t*)malloc(*n_cigar * 4);
1028
1029 cigar[0] = 1u << 4 | path[path_len-1].ctype;
1030 last_type = path[path_len-1].ctype;
1031 for (i = path_len - 2, n = 0; i >= 0; --i) {
1032 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
1033 else {
1034 cigar[++n] = 1u << 4 | path[i].ctype;
1035 last_type = path[i].ctype;
1036 }
1037 }
1038
1039 return cigar;
1040 }
1041
1042 #ifdef STDALN_MAIN
1043 int main()
1044 {
1045 AlnAln *aln_local, *aln_global, *aln_left;
1046 int i;
1047
1048 aln_local = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCgCTCGGCTAGCTGT", &aln_param_blast, 0, 1);
1049 aln_global = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 1, 1);
1050 // aln_left = aln_stdaln( "GATGCACTGCATACGGCTCGCCTAGATCA", "GATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 2, 1);
1051 aln_left = aln_stdaln("CACCTTCGACTCACGTCTCATTCTCGGAGTCGAGTGGACGGTCCCTCATACACGAACAGGTTC",
1052 "CACCTTCGACTTTCACCTCTCATTCTCGGACTCGAGTGGACGGTCCCTCATCCAAGAACAGGGTCTGTGAAA", &aln_param_blast, 2, 1);
1053
1054 printf(">%d,%d\t%d,%d\n", aln_local->start1, aln_local->end1, aln_local->start2, aln_local->end2);
1055 printf("%s\n%s\n%s\n", aln_local->out1, aln_local->outm, aln_local->out2);
1056
1057 printf(">%d,%d\t%d,%d\t", aln_global->start1, aln_global->end1, aln_global->start2, aln_global->end2);
1058 for (i = 0; i != aln_global->n_cigar; ++i)
1059 printf("%d%c", aln_global->cigar32[i]>>4, "MID"[aln_global->cigar32[i]&0xf]);
1060 printf("\n%s\n%s\n%s\n", aln_global->out1, aln_global->outm, aln_global->out2);
1061
1062 printf(">%d\t%d,%d\t%d,%d\t", aln_left->score, aln_left->start1, aln_left->end1, aln_left->start2, aln_left->end2);
1063 for (i = 0; i != aln_left->n_cigar; ++i)
1064 printf("%d%c", aln_left->cigar32[i]>>4, "MID"[aln_left->cigar32[i]&0xf]);
1065 printf("\n%s\n%s\n%s\n", aln_left->out1, aln_left->outm, aln_left->out2);
1066
1067 aln_free_AlnAln(aln_local);
1068 aln_free_AlnAln(aln_global);
1069 aln_free_AlnAln(aln_left);
1070 return 0;
1071 }
1072 #endif