Mercurial > repos > ashvark > qiime_1_8_0
comparison bwa-0.6.2/QSufSort.c @ 2:a294fbfcb1db draft default tip
Uploaded BWA
author | ashvark |
---|---|
date | Fri, 18 Jul 2014 07:55:59 -0400 |
parents | dd1186b11b3b |
children |
comparison
equal
deleted
inserted
replaced
1:a9636dc1e99a | 2:a294fbfcb1db |
---|---|
1 /* QSufSort.c | |
2 | |
3 Original source from qsufsort.c | |
4 | |
5 Copyright 1999, N. Jesper Larsson, all rights reserved. | |
6 | |
7 This file contains an implementation of the algorithm presented in "Faster | |
8 Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko | |
9 Sadakane (sada@is.s.u-tokyo.ac.jp). | |
10 | |
11 This software may be used freely for any purpose. However, when distributed, | |
12 the original source must be clearly stated, and, when the source code is | |
13 distributed, the copyright notice must be retained and any alterations in | |
14 the code must be clearly marked. No warranty is given regarding the quality | |
15 of this software. | |
16 | |
17 Modified by Wong Chi-Kwong, 2004 | |
18 | |
19 Changes summary: - Used long variable and function names | |
20 - Removed global variables | |
21 - Replace pointer references with array references | |
22 - Used insertion sort in place of selection sort and increased insertion sort threshold | |
23 - Reconstructing suffix array from inverse becomes an option | |
24 - Add handling where end-of-text symbol is not necessary < all characters | |
25 - Removed codes for supporting alphabet size > number of characters | |
26 | |
27 No warrenty is given regarding the quality of the modifications. | |
28 | |
29 */ | |
30 | |
31 | |
32 #include <stdio.h> | |
33 #include <stdlib.h> | |
34 #include <limits.h> | |
35 #include "QSufSort.h" | |
36 | |
37 #define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) | |
38 #define med3(a, b, c) ( a<b ? (b<c ? b : a<c ? c : a) : (b>c ? b : a>c ? c : a)) | |
39 #define swap(a, b, t); t = a; a = b; b = t; | |
40 | |
41 // Static functions | |
42 static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, | |
43 const qsint_t highestPos, const qsint_t numSortedChar); | |
44 static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, | |
45 const qsint_t highestPos, const qsint_t numSortedChar); | |
46 static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, | |
47 const qsint_t highestPos, const qsint_t numSortedChar); | |
48 static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize); | |
49 static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, | |
50 const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated); | |
51 | |
52 /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size | |
53 n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original | |
54 contents of x[n] is disregarded, the n-th symbol being regarded as | |
55 end-of-string smaller than all other symbols.*/ | |
56 void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, | |
57 const qsint_t smallestInputSymbol, const int skipTransform) | |
58 { | |
59 qsint_t i, j; | |
60 qsint_t s, negatedSortedGroupLength; | |
61 qsint_t numSymbolAggregated; | |
62 qsint_t maxNumInputSymbol; | |
63 qsint_t numSortedPos = 1; | |
64 qsint_t newAlphabetSize; | |
65 | |
66 maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; | |
67 | |
68 if (!skipTransform) { | |
69 /* bucketing possible*/ | |
70 newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol, | |
71 numChar, &numSymbolAggregated); | |
72 QSufSortBucketSort(V, I, numChar, newAlphabetSize); | |
73 I[0] = -1; | |
74 V[numChar] = 0; | |
75 numSortedPos = numSymbolAggregated; | |
76 } | |
77 | |
78 while ((qsint_t)(I[0]) >= -(qsint_t)numChar) { | |
79 i = 0; | |
80 negatedSortedGroupLength = 0; | |
81 do { | |
82 s = I[i]; | |
83 if (s < 0) { | |
84 i -= s; /* skip over sorted group.*/ | |
85 negatedSortedGroupLength += s; | |
86 } else { | |
87 if (negatedSortedGroupLength) { | |
88 I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine preceding sorted groups */ | |
89 negatedSortedGroupLength = 0; | |
90 } | |
91 j = V[s] + 1; | |
92 QSufSortSortSplit(V, I, i, j - 1, numSortedPos); | |
93 i = j; | |
94 } | |
95 } while (i <= numChar); | |
96 if (negatedSortedGroupLength) { | |
97 /* array ends with a sorted group.*/ | |
98 I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine sorted groups at end of I.*/ | |
99 } | |
100 numSortedPos *= 2; /* double sorted-depth.*/ | |
101 } | |
102 } | |
103 | |
104 void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar) | |
105 { | |
106 qsint_t i; | |
107 for (i=0; i<=numChar; i++) | |
108 I[V[i]] = i + 1; | |
109 } | |
110 | |
111 /* Sorting routine called for each unsorted group. Sorts the array of integers | |
112 (suffix numbers) of length n starting at p. The algorithm is a ternary-split | |
113 quicksort taken from Bentley & McIlroy, "Engineering a Sort Function", | |
114 Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This | |
115 function is based on Program 7.*/ | |
116 static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, | |
117 const qsint_t highestPos, const qsint_t numSortedChar) { | |
118 | |
119 qsint_t a, b, c, d; | |
120 qsint_t l, m; | |
121 qsint_t f, v, s, t; | |
122 qsint_t tmp; | |
123 qsint_t numItem; | |
124 | |
125 numItem = highestPos - lowestPos + 1; | |
126 | |
127 if (numItem <= INSERT_SORT_NUM_ITEM) { | |
128 QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar); | |
129 return; | |
130 } | |
131 | |
132 v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar); | |
133 | |
134 a = b = lowestPos; | |
135 c = d = highestPos; | |
136 | |
137 while (1) { | |
138 while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) { | |
139 if (f == v) { | |
140 swap(I[a], I[b], tmp); | |
141 a++; | |
142 } | |
143 b++; | |
144 } | |
145 while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) { | |
146 if (f == v) { | |
147 swap(I[c], I[d], tmp); | |
148 d--; | |
149 } | |
150 c--; | |
151 } | |
152 if (b > c) | |
153 break; | |
154 swap(I[b], I[c], tmp); | |
155 b++; | |
156 c--; | |
157 } | |
158 | |
159 s = a - lowestPos; | |
160 t = b - a; | |
161 s = min(s, t); | |
162 for (l = lowestPos, m = b - s; m < b; l++, m++) { | |
163 swap(I[l], I[m], tmp); | |
164 } | |
165 | |
166 s = d - c; | |
167 t = highestPos - d; | |
168 s = min(s, t); | |
169 for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) { | |
170 swap(I[l], I[m], tmp); | |
171 } | |
172 | |
173 s = b - a; | |
174 t = d - c; | |
175 if (s > 0) | |
176 QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar); | |
177 | |
178 // Update group number for equal portion | |
179 a = lowestPos + s; | |
180 b = highestPos - t; | |
181 if (a == b) { | |
182 // Sorted group | |
183 V[I[a]] = a; | |
184 I[a] = -1; | |
185 } else { | |
186 // Unsorted group | |
187 for (c=a; c<=b; c++) | |
188 V[I[c]] = b; | |
189 } | |
190 | |
191 if (t > 0) | |
192 QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar); | |
193 | |
194 } | |
195 | |
196 /* Algorithm by Bentley & McIlroy.*/ | |
197 static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, | |
198 const qsint_t highestPos, const qsint_t numSortedChar) { | |
199 | |
200 qsint_t m; | |
201 qsint_t keyl, keym, keyn; | |
202 qsint_t key1, key2, key3; | |
203 qsint_t s; | |
204 qsint_t numItem; | |
205 | |
206 numItem = highestPos - lowestPos + 1; | |
207 | |
208 m = lowestPos + numItem / 2; | |
209 | |
210 s = numItem / 8; | |
211 key1 = KEY(V, I, lowestPos, numSortedChar); | |
212 key2 = KEY(V, I, lowestPos+s, numSortedChar); | |
213 key3 = KEY(V, I, lowestPos+2*s, numSortedChar); | |
214 keyl = med3(key1, key2, key3); | |
215 key1 = KEY(V, I, m-s, numSortedChar); | |
216 key2 = KEY(V, I, m, numSortedChar); | |
217 key3 = KEY(V, I, m+s, numSortedChar); | |
218 keym = med3(key1, key2, key3); | |
219 key1 = KEY(V, I, highestPos-2*s, numSortedChar); | |
220 key2 = KEY(V, I, highestPos-s, numSortedChar); | |
221 key3 = KEY(V, I, highestPos, numSortedChar); | |
222 keyn = med3(key1, key2, key3); | |
223 | |
224 return med3(keyl, keym, keyn); | |
225 | |
226 | |
227 } | |
228 | |
229 /* Quadratic sorting method to use for small subarrays. */ | |
230 static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, | |
231 const qsint_t highestPos, const qsint_t numSortedChar) | |
232 { | |
233 qsint_t i, j; | |
234 qsint_t tmpKey, tmpPos; | |
235 qsint_t numItem; | |
236 qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM]; | |
237 qsint_t negativeSortedLength; | |
238 qsint_t groupNum; | |
239 | |
240 numItem = highestPos - lowestPos + 1; | |
241 | |
242 for (i=0; i<numItem; i++) { | |
243 pos[i] = I[lowestPos + i]; | |
244 key[i] = V[pos[i] + numSortedChar]; | |
245 } | |
246 | |
247 for (i=1; i<numItem; i++) { | |
248 tmpKey = key[i]; | |
249 tmpPos = pos[i]; | |
250 for (j=i; j>0 && key[j-1] > tmpKey; j--) { | |
251 key[j] = key[j-1]; | |
252 pos[j] = pos[j-1]; | |
253 } | |
254 key[j] = tmpKey; | |
255 pos[j] = tmpPos; | |
256 } | |
257 | |
258 negativeSortedLength = -1; | |
259 | |
260 i = numItem - 1; | |
261 groupNum = highestPos; | |
262 while (i > 0) { | |
263 I[i+lowestPos] = pos[i]; | |
264 V[I[i+lowestPos]] = groupNum; | |
265 if (key[i-1] == key[i]) { | |
266 negativeSortedLength = 0; | |
267 } else { | |
268 if (negativeSortedLength < 0) | |
269 I[i+lowestPos] = negativeSortedLength; | |
270 groupNum = i + lowestPos - 1; | |
271 negativeSortedLength--; | |
272 } | |
273 i--; | |
274 } | |
275 | |
276 I[lowestPos] = pos[0]; | |
277 V[I[lowestPos]] = groupNum; | |
278 if (negativeSortedLength < 0) | |
279 I[lowestPos] = negativeSortedLength; | |
280 } | |
281 | |
282 /* Bucketsort for first iteration. | |
283 | |
284 Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear | |
285 at least once. x[n] is 0. (This is the corresponding output of transform.) k | |
286 must be at most n+1. p is array of size n+1 whose contents are disregarded. | |
287 | |
288 Output: x is V and p is I after the initial sorting stage of the refined | |
289 suffix sorting algorithm.*/ | |
290 | |
291 static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize) | |
292 { | |
293 qsint_t i, c; | |
294 qsint_t d; | |
295 qsint_t groupNum; | |
296 qsint_t currentIndex; | |
297 | |
298 // mark linked list empty | |
299 for (i=0; i<alphabetSize; i++) | |
300 I[i] = -1; | |
301 | |
302 // insert to linked list | |
303 for (i=0; i<=numChar; i++) { | |
304 c = V[i]; | |
305 V[i] = (qsint_t)(I[c]); | |
306 I[c] = i; | |
307 } | |
308 | |
309 currentIndex = numChar; | |
310 for (i=alphabetSize; i>0; i--) { | |
311 c = I[i-1]; | |
312 d = (qsint_t)(V[c]); | |
313 groupNum = currentIndex; | |
314 V[c] = groupNum; | |
315 if (d >= 0) { | |
316 I[currentIndex] = c; | |
317 while (d >= 0) { | |
318 c = d; | |
319 d = V[c]; | |
320 V[c] = groupNum; | |
321 currentIndex--; | |
322 I[currentIndex] = c; | |
323 } | |
324 } else { | |
325 // sorted group | |
326 I[currentIndex] = -1; | |
327 } | |
328 currentIndex--; | |
329 } | |
330 } | |
331 | |
332 /* Transforms the alphabet of x by attempting to aggregate several symbols into | |
333 one, while preserving the suffix order of x. The alphabet may also be | |
334 compacted, so that x on output comprises all integers of the new alphabet | |
335 with no skipped numbers. | |
336 | |
337 Input: x is an array of size n+1 whose first n elements are positive | |
338 integers in the range l...k-1. p is array of size n+1, used for temporary | |
339 storage. q controls aggregation and compaction by defining the maximum intue | |
340 for any symbol during transformation: q must be at least k-l; if q<=n, | |
341 compaction is guaranteed; if k-l>n, compaction is never done; if q is | |
342 INT_MAX, the maximum number of symbols are aggregated into one. | |
343 | |
344 Output: Returns an integer j in the range 1...q representing the size of the | |
345 new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is | |
346 set to the number of old symbols grouped into one. Only x[n] is 0.*/ | |
347 static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, | |
348 const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated) | |
349 { | |
350 qsint_t c, i, j; | |
351 qsint_t a; // numSymbolAggregated | |
352 qsint_t mask; | |
353 qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0; | |
354 qsint_t newAlphabetSize; | |
355 qsint_t maxNumInputSymbol, maxNumBit, maxSymbol; | |
356 | |
357 maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; | |
358 | |
359 for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit; | |
360 maxSymbol = QSINT_MAX >> maxNumBit; | |
361 | |
362 c = maxNumInputSymbol; | |
363 for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) { | |
364 minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1); | |
365 maxSymbolInChunk = c; | |
366 c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol; | |
367 } | |
368 | |
369 mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/ | |
370 V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/ | |
371 | |
372 /* bucketing possible, compact alphabet.*/ | |
373 for (i=0; i<=maxSymbolInChunk; i++) | |
374 I[i] = 0; /* zero transformation table.*/ | |
375 c = minSymbolInChunk; | |
376 for (i=a; i<=numChar; i++) { | |
377 I[c] = 1; /* mark used chunk symbol.*/ | |
378 c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/ | |
379 } | |
380 for (i=1; i<a; i++) { /* handle last r-1 positions.*/ | |
381 I[c] = 1; /* mark used chunk symbol.*/ | |
382 c = (c & mask) << maxNumBit; /* shift in next old symbol in chunk.*/ | |
383 } | |
384 newAlphabetSize = 1; | |
385 for (i=0; i<=maxSymbolInChunk; i++) { | |
386 if (I[i]) { | |
387 I[i] = newAlphabetSize; | |
388 newAlphabetSize++; | |
389 } | |
390 } | |
391 c = minSymbolInChunk; | |
392 for (i=0, j=a; j<=numChar; i++, j++) { | |
393 V[i] = I[c]; /* transform to new alphabet.*/ | |
394 c = ((c & mask) << maxNumBit) | (V[j] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/ | |
395 } | |
396 for (; i<numChar; i++) { /* handle last a-1 positions.*/ | |
397 V[i] = I[c]; /* transform to new alphabet.*/ | |
398 c = (c & mask) << maxNumBit; /* shift right-end zero in chunk.*/ | |
399 } | |
400 | |
401 V[numChar] = 0; /* end-of-string symbol is zero.*/ | |
402 | |
403 *numSymbolAggregated = a; | |
404 return newAlphabetSize; | |
405 } |