Mercurial > repos > ashvark > qiime_1_8_0
comparison bwa-0.6.2/is.c @ 2:a294fbfcb1db draft default tip
Uploaded BWA
author | ashvark |
---|---|
date | Fri, 18 Jul 2014 07:55:59 -0400 |
parents | dd1186b11b3b |
children |
comparison
equal
deleted
inserted
replaced
1:a9636dc1e99a | 2:a294fbfcb1db |
---|---|
1 /* | |
2 * sais.c for sais-lite | |
3 * Copyright (c) 2008 Yuta Mori All Rights Reserved. | |
4 * | |
5 * Permission is hereby granted, free of charge, to any person | |
6 * obtaining a copy of this software and associated documentation | |
7 * files (the "Software"), to deal in the Software without | |
8 * restriction, including without limitation the rights to use, | |
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell | |
10 * copies of the Software, and to permit persons to whom the | |
11 * Software is furnished to do so, subject to the following | |
12 * conditions: | |
13 * | |
14 * The above copyright notice and this permission notice shall be | |
15 * included in all copies or substantial portions of the Software. | |
16 * | |
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | |
19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
24 * OTHER DEALINGS IN THE SOFTWARE. | |
25 */ | |
26 | |
27 #include <stdlib.h> | |
28 | |
29 typedef unsigned char ubyte_t; | |
30 #define chr(i) (cs == sizeof(int) ? ((const int *)T)[i]:((const unsigned char *)T)[i]) | |
31 | |
32 /* find the start or end of each bucket */ | |
33 static void getCounts(const unsigned char *T, int *C, int n, int k, int cs) | |
34 { | |
35 int i; | |
36 for (i = 0; i < k; ++i) C[i] = 0; | |
37 for (i = 0; i < n; ++i) ++C[chr(i)]; | |
38 } | |
39 static void getBuckets(const int *C, int *B, int k, int end) | |
40 { | |
41 int i, sum = 0; | |
42 if (end) { | |
43 for (i = 0; i < k; ++i) { | |
44 sum += C[i]; | |
45 B[i] = sum; | |
46 } | |
47 } else { | |
48 for (i = 0; i < k; ++i) { | |
49 sum += C[i]; | |
50 B[i] = sum - C[i]; | |
51 } | |
52 } | |
53 } | |
54 | |
55 /* compute SA */ | |
56 static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs) | |
57 { | |
58 int *b, i, j; | |
59 int c0, c1; | |
60 /* compute SAl */ | |
61 if (C == B) getCounts(T, C, n, k, cs); | |
62 getBuckets(C, B, k, 0); /* find starts of buckets */ | |
63 j = n - 1; | |
64 b = SA + B[c1 = chr(j)]; | |
65 *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j; | |
66 for (i = 0; i < n; ++i) { | |
67 j = SA[i], SA[i] = ~j; | |
68 if (0 < j) { | |
69 --j; | |
70 if ((c0 = chr(j)) != c1) { | |
71 B[c1] = b - SA; | |
72 b = SA + B[c1 = c0]; | |
73 } | |
74 *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j; | |
75 } | |
76 } | |
77 /* compute SAs */ | |
78 if (C == B) getCounts(T, C, n, k, cs); | |
79 getBuckets(C, B, k, 1); /* find ends of buckets */ | |
80 for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) { | |
81 if (0 < (j = SA[i])) { | |
82 --j; | |
83 if ((c0 = chr(j)) != c1) { | |
84 B[c1] = b - SA; | |
85 b = SA + B[c1 = c0]; | |
86 } | |
87 *--b = ((j == 0) || (chr(j - 1) > c1)) ? ~j : j; | |
88 } else SA[i] = ~j; | |
89 } | |
90 } | |
91 | |
92 /* | |
93 * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working | |
94 * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet | |
95 */ | |
96 static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs) | |
97 { | |
98 int *C, *B, *RA; | |
99 int i, j, c, m, p, q, plen, qlen, name; | |
100 int c0, c1; | |
101 int diff; | |
102 | |
103 /* stage 1: reduce the problem by at least 1/2 sort all the | |
104 * S-substrings */ | |
105 if (k <= fs) { | |
106 C = SA + n; | |
107 B = (k <= (fs - k)) ? C + k : C; | |
108 } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2; | |
109 getCounts(T, C, n, k, cs); | |
110 getBuckets(C, B, k, 1); /* find ends of buckets */ | |
111 for (i = 0; i < n; ++i) SA[i] = 0; | |
112 for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { | |
113 if ((c0 = chr(i)) < (c1 + c)) c = 1; | |
114 else if (c != 0) SA[--B[c1]] = i + 1, c = 0; | |
115 } | |
116 induceSA(T, SA, C, B, n, k, cs); | |
117 if (fs < k) free(C); | |
118 /* compact all the sorted substrings into the first m items of SA | |
119 * 2*m must be not larger than n (proveable) */ | |
120 for (i = 0, m = 0; i < n; ++i) { | |
121 p = SA[i]; | |
122 if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) { | |
123 for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j); | |
124 if ((j < n) && (c0 < c1)) SA[m++] = p; | |
125 } | |
126 } | |
127 for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */ | |
128 /* store the length of all substrings */ | |
129 for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { | |
130 if ((c0 = chr(i)) < (c1 + c)) c = 1; | |
131 else if (c != 0) { | |
132 SA[m + ((i + 1) >> 1)] = j - i - 1; | |
133 j = i + 1; | |
134 c = 0; | |
135 } | |
136 } | |
137 /* find the lexicographic names of all substrings */ | |
138 for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) { | |
139 p = SA[i], plen = SA[m + (p >> 1)], diff = 1; | |
140 if (plen == qlen) { | |
141 for (j = 0; (j < plen) && (chr(p + j) == chr(q + j)); j++); | |
142 if (j == plen) diff = 0; | |
143 } | |
144 if (diff != 0) ++name, q = p, qlen = plen; | |
145 SA[m + (p >> 1)] = name; | |
146 } | |
147 | |
148 /* stage 2: solve the reduced problem recurse if names are not yet | |
149 * unique */ | |
150 if (name < m) { | |
151 RA = SA + n + fs - m; | |
152 for (i = n - 1, j = m - 1; m <= i; --i) { | |
153 if (SA[i] != 0) RA[j--] = SA[i] - 1; | |
154 } | |
155 if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2; | |
156 for (i = n - 2, j = m - 1, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { | |
157 if ((c0 = chr(i)) < (c1 + c)) c = 1; | |
158 else if (c != 0) RA[j--] = i + 1, c = 0; /* get p1 */ | |
159 } | |
160 for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */ | |
161 } | |
162 /* stage 3: induce the result for the original problem */ | |
163 if (k <= fs) { | |
164 C = SA + n; | |
165 B = (k <= (fs - k)) ? C + k : C; | |
166 } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2; | |
167 /* put all left-most S characters into their buckets */ | |
168 getCounts(T, C, n, k, cs); | |
169 getBuckets(C, B, k, 1); /* find ends of buckets */ | |
170 for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */ | |
171 for (i = m - 1; 0 <= i; --i) { | |
172 j = SA[i], SA[i] = 0; | |
173 SA[--B[chr(j)]] = j; | |
174 } | |
175 induceSA(T, SA, C, B, n, k, cs); | |
176 if (fs < k) free(C); | |
177 return 0; | |
178 } | |
179 | |
180 /** | |
181 * Constructs the suffix array of a given string. | |
182 * @param T[0..n-1] The input string. | |
183 * @param SA[0..n] The output array of suffixes. | |
184 * @param n The length of the given string. | |
185 * @return 0 if no error occurred | |
186 */ | |
187 int is_sa(const ubyte_t *T, int *SA, int n) | |
188 { | |
189 if ((T == NULL) || (SA == NULL) || (n < 0)) return -1; | |
190 SA[0] = n; | |
191 if (n <= 1) { | |
192 if (n == 1) SA[1] = 0; | |
193 return 0; | |
194 } | |
195 return sais_main(T, SA+1, 0, n, 256, 1); | |
196 } | |
197 | |
198 /** | |
199 * Constructs the burrows-wheeler transformed string of a given string. | |
200 * @param T[0..n-1] The input string. | |
201 * @param n The length of the given string. | |
202 * @return The primary index if no error occurred, -1 or -2 otherwise. | |
203 */ | |
204 int is_bwt(ubyte_t *T, int n) | |
205 { | |
206 int *SA, i, primary = 0; | |
207 SA = (int*)calloc(n+1, sizeof(int)); | |
208 is_sa(T, SA, n); | |
209 | |
210 for (i = 0; i <= n; ++i) { | |
211 if (SA[i] == 0) primary = i; | |
212 else SA[i] = T[SA[i] - 1]; | |
213 } | |
214 for (i = 0; i < primary; ++i) T[i] = SA[i]; | |
215 for (; i < n; ++i) T[i] = SA[i + 1]; | |
216 free(SA); | |
217 return primary; | |
218 } |