0
|
1 /* QSufSort.c
|
|
2
|
|
3 Original source from qsufsort.c
|
|
4
|
|
5 Copyright 1999, N. Jesper Larsson, all rights reserved.
|
|
6
|
|
7 This file contains an implementation of the algorithm presented in "Faster
|
|
8 Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
|
|
9 Sadakane (sada@is.s.u-tokyo.ac.jp).
|
|
10
|
|
11 This software may be used freely for any purpose. However, when distributed,
|
|
12 the original source must be clearly stated, and, when the source code is
|
|
13 distributed, the copyright notice must be retained and any alterations in
|
|
14 the code must be clearly marked. No warranty is given regarding the quality
|
|
15 of this software.
|
|
16
|
|
17 Modified by Wong Chi-Kwong, 2004
|
|
18
|
|
19 Changes summary: - Used long variable and function names
|
|
20 - Removed global variables
|
|
21 - Replace pointer references with array references
|
|
22 - Used insertion sort in place of selection sort and increased insertion sort threshold
|
|
23 - Reconstructing suffix array from inverse becomes an option
|
|
24 - Add handling where end-of-text symbol is not necessary < all characters
|
|
25 - Removed codes for supporting alphabet size > number of characters
|
|
26
|
|
27 No warrenty is given regarding the quality of the modifications.
|
|
28
|
|
29 */
|
|
30
|
|
31
|
|
32 #include <stdio.h>
|
|
33 #include <stdlib.h>
|
|
34 #include <limits.h>
|
|
35 #include "QSufSort.h"
|
|
36
|
|
37 #define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) )
|
|
38 #define med3(a, b, c) ( a<b ? (b<c ? b : a<c ? c : a) : (b>c ? b : a>c ? c : a))
|
|
39 #define swap(a, b, t); t = a; a = b; b = t;
|
|
40
|
|
41 // Static functions
|
|
42 static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
|
43 const qsint_t highestPos, const qsint_t numSortedChar);
|
|
44 static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
|
45 const qsint_t highestPos, const qsint_t numSortedChar);
|
|
46 static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
|
47 const qsint_t highestPos, const qsint_t numSortedChar);
|
|
48 static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize);
|
|
49 static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
|
|
50 const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated);
|
|
51
|
|
52 /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
|
|
53 n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
|
|
54 contents of x[n] is disregarded, the n-th symbol being regarded as
|
|
55 end-of-string smaller than all other symbols.*/
|
|
56 void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
|
|
57 const qsint_t smallestInputSymbol, const int skipTransform)
|
|
58 {
|
|
59 qsint_t i, j;
|
|
60 qsint_t s, negatedSortedGroupLength;
|
|
61 qsint_t numSymbolAggregated;
|
|
62 qsint_t maxNumInputSymbol;
|
|
63 qsint_t numSortedPos = 1;
|
|
64 qsint_t newAlphabetSize;
|
|
65
|
|
66 maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
|
|
67
|
|
68 if (!skipTransform) {
|
|
69 /* bucketing possible*/
|
|
70 newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol,
|
|
71 numChar, &numSymbolAggregated);
|
|
72 QSufSortBucketSort(V, I, numChar, newAlphabetSize);
|
|
73 I[0] = -1;
|
|
74 V[numChar] = 0;
|
|
75 numSortedPos = numSymbolAggregated;
|
|
76 }
|
|
77
|
|
78 while ((qsint_t)(I[0]) >= -(qsint_t)numChar) {
|
|
79 i = 0;
|
|
80 negatedSortedGroupLength = 0;
|
|
81 do {
|
|
82 s = I[i];
|
|
83 if (s < 0) {
|
|
84 i -= s; /* skip over sorted group.*/
|
|
85 negatedSortedGroupLength += s;
|
|
86 } else {
|
|
87 if (negatedSortedGroupLength) {
|
|
88 I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine preceding sorted groups */
|
|
89 negatedSortedGroupLength = 0;
|
|
90 }
|
|
91 j = V[s] + 1;
|
|
92 QSufSortSortSplit(V, I, i, j - 1, numSortedPos);
|
|
93 i = j;
|
|
94 }
|
|
95 } while (i <= numChar);
|
|
96 if (negatedSortedGroupLength) {
|
|
97 /* array ends with a sorted group.*/
|
|
98 I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine sorted groups at end of I.*/
|
|
99 }
|
|
100 numSortedPos *= 2; /* double sorted-depth.*/
|
|
101 }
|
|
102 }
|
|
103
|
|
104 void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar)
|
|
105 {
|
|
106 qsint_t i;
|
|
107 for (i=0; i<=numChar; i++)
|
|
108 I[V[i]] = i + 1;
|
|
109 }
|
|
110
|
|
111 /* Sorting routine called for each unsorted group. Sorts the array of integers
|
|
112 (suffix numbers) of length n starting at p. The algorithm is a ternary-split
|
|
113 quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
|
|
114 Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
|
|
115 function is based on Program 7.*/
|
|
116 static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
|
117 const qsint_t highestPos, const qsint_t numSortedChar) {
|
|
118
|
|
119 qsint_t a, b, c, d;
|
|
120 qsint_t l, m;
|
|
121 qsint_t f, v, s, t;
|
|
122 qsint_t tmp;
|
|
123 qsint_t numItem;
|
|
124
|
|
125 numItem = highestPos - lowestPos + 1;
|
|
126
|
|
127 if (numItem <= INSERT_SORT_NUM_ITEM) {
|
|
128 QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar);
|
|
129 return;
|
|
130 }
|
|
131
|
|
132 v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar);
|
|
133
|
|
134 a = b = lowestPos;
|
|
135 c = d = highestPos;
|
|
136
|
|
137 while (1) {
|
|
138 while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) {
|
|
139 if (f == v) {
|
|
140 swap(I[a], I[b], tmp);
|
|
141 a++;
|
|
142 }
|
|
143 b++;
|
|
144 }
|
|
145 while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) {
|
|
146 if (f == v) {
|
|
147 swap(I[c], I[d], tmp);
|
|
148 d--;
|
|
149 }
|
|
150 c--;
|
|
151 }
|
|
152 if (b > c)
|
|
153 break;
|
|
154 swap(I[b], I[c], tmp);
|
|
155 b++;
|
|
156 c--;
|
|
157 }
|
|
158
|
|
159 s = a - lowestPos;
|
|
160 t = b - a;
|
|
161 s = min(s, t);
|
|
162 for (l = lowestPos, m = b - s; m < b; l++, m++) {
|
|
163 swap(I[l], I[m], tmp);
|
|
164 }
|
|
165
|
|
166 s = d - c;
|
|
167 t = highestPos - d;
|
|
168 s = min(s, t);
|
|
169 for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) {
|
|
170 swap(I[l], I[m], tmp);
|
|
171 }
|
|
172
|
|
173 s = b - a;
|
|
174 t = d - c;
|
|
175 if (s > 0)
|
|
176 QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar);
|
|
177
|
|
178 // Update group number for equal portion
|
|
179 a = lowestPos + s;
|
|
180 b = highestPos - t;
|
|
181 if (a == b) {
|
|
182 // Sorted group
|
|
183 V[I[a]] = a;
|
|
184 I[a] = -1;
|
|
185 } else {
|
|
186 // Unsorted group
|
|
187 for (c=a; c<=b; c++)
|
|
188 V[I[c]] = b;
|
|
189 }
|
|
190
|
|
191 if (t > 0)
|
|
192 QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar);
|
|
193
|
|
194 }
|
|
195
|
|
196 /* Algorithm by Bentley & McIlroy.*/
|
|
197 static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
|
198 const qsint_t highestPos, const qsint_t numSortedChar) {
|
|
199
|
|
200 qsint_t m;
|
|
201 qsint_t keyl, keym, keyn;
|
|
202 qsint_t key1, key2, key3;
|
|
203 qsint_t s;
|
|
204 qsint_t numItem;
|
|
205
|
|
206 numItem = highestPos - lowestPos + 1;
|
|
207
|
|
208 m = lowestPos + numItem / 2;
|
|
209
|
|
210 s = numItem / 8;
|
|
211 key1 = KEY(V, I, lowestPos, numSortedChar);
|
|
212 key2 = KEY(V, I, lowestPos+s, numSortedChar);
|
|
213 key3 = KEY(V, I, lowestPos+2*s, numSortedChar);
|
|
214 keyl = med3(key1, key2, key3);
|
|
215 key1 = KEY(V, I, m-s, numSortedChar);
|
|
216 key2 = KEY(V, I, m, numSortedChar);
|
|
217 key3 = KEY(V, I, m+s, numSortedChar);
|
|
218 keym = med3(key1, key2, key3);
|
|
219 key1 = KEY(V, I, highestPos-2*s, numSortedChar);
|
|
220 key2 = KEY(V, I, highestPos-s, numSortedChar);
|
|
221 key3 = KEY(V, I, highestPos, numSortedChar);
|
|
222 keyn = med3(key1, key2, key3);
|
|
223
|
|
224 return med3(keyl, keym, keyn);
|
|
225
|
|
226
|
|
227 }
|
|
228
|
|
229 /* Quadratic sorting method to use for small subarrays. */
|
|
230 static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
|
|
231 const qsint_t highestPos, const qsint_t numSortedChar)
|
|
232 {
|
|
233 qsint_t i, j;
|
|
234 qsint_t tmpKey, tmpPos;
|
|
235 qsint_t numItem;
|
|
236 qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM];
|
|
237 qsint_t negativeSortedLength;
|
|
238 qsint_t groupNum;
|
|
239
|
|
240 numItem = highestPos - lowestPos + 1;
|
|
241
|
|
242 for (i=0; i<numItem; i++) {
|
|
243 pos[i] = I[lowestPos + i];
|
|
244 key[i] = V[pos[i] + numSortedChar];
|
|
245 }
|
|
246
|
|
247 for (i=1; i<numItem; i++) {
|
|
248 tmpKey = key[i];
|
|
249 tmpPos = pos[i];
|
|
250 for (j=i; j>0 && key[j-1] > tmpKey; j--) {
|
|
251 key[j] = key[j-1];
|
|
252 pos[j] = pos[j-1];
|
|
253 }
|
|
254 key[j] = tmpKey;
|
|
255 pos[j] = tmpPos;
|
|
256 }
|
|
257
|
|
258 negativeSortedLength = -1;
|
|
259
|
|
260 i = numItem - 1;
|
|
261 groupNum = highestPos;
|
|
262 while (i > 0) {
|
|
263 I[i+lowestPos] = pos[i];
|
|
264 V[I[i+lowestPos]] = groupNum;
|
|
265 if (key[i-1] == key[i]) {
|
|
266 negativeSortedLength = 0;
|
|
267 } else {
|
|
268 if (negativeSortedLength < 0)
|
|
269 I[i+lowestPos] = negativeSortedLength;
|
|
270 groupNum = i + lowestPos - 1;
|
|
271 negativeSortedLength--;
|
|
272 }
|
|
273 i--;
|
|
274 }
|
|
275
|
|
276 I[lowestPos] = pos[0];
|
|
277 V[I[lowestPos]] = groupNum;
|
|
278 if (negativeSortedLength < 0)
|
|
279 I[lowestPos] = negativeSortedLength;
|
|
280 }
|
|
281
|
|
282 /* Bucketsort for first iteration.
|
|
283
|
|
284 Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
|
|
285 at least once. x[n] is 0. (This is the corresponding output of transform.) k
|
|
286 must be at most n+1. p is array of size n+1 whose contents are disregarded.
|
|
287
|
|
288 Output: x is V and p is I after the initial sorting stage of the refined
|
|
289 suffix sorting algorithm.*/
|
|
290
|
|
291 static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize)
|
|
292 {
|
|
293 qsint_t i, c;
|
|
294 qsint_t d;
|
|
295 qsint_t groupNum;
|
|
296 qsint_t currentIndex;
|
|
297
|
|
298 // mark linked list empty
|
|
299 for (i=0; i<alphabetSize; i++)
|
|
300 I[i] = -1;
|
|
301
|
|
302 // insert to linked list
|
|
303 for (i=0; i<=numChar; i++) {
|
|
304 c = V[i];
|
|
305 V[i] = (qsint_t)(I[c]);
|
|
306 I[c] = i;
|
|
307 }
|
|
308
|
|
309 currentIndex = numChar;
|
|
310 for (i=alphabetSize; i>0; i--) {
|
|
311 c = I[i-1];
|
|
312 d = (qsint_t)(V[c]);
|
|
313 groupNum = currentIndex;
|
|
314 V[c] = groupNum;
|
|
315 if (d >= 0) {
|
|
316 I[currentIndex] = c;
|
|
317 while (d >= 0) {
|
|
318 c = d;
|
|
319 d = V[c];
|
|
320 V[c] = groupNum;
|
|
321 currentIndex--;
|
|
322 I[currentIndex] = c;
|
|
323 }
|
|
324 } else {
|
|
325 // sorted group
|
|
326 I[currentIndex] = -1;
|
|
327 }
|
|
328 currentIndex--;
|
|
329 }
|
|
330 }
|
|
331
|
|
332 /* Transforms the alphabet of x by attempting to aggregate several symbols into
|
|
333 one, while preserving the suffix order of x. The alphabet may also be
|
|
334 compacted, so that x on output comprises all integers of the new alphabet
|
|
335 with no skipped numbers.
|
|
336
|
|
337 Input: x is an array of size n+1 whose first n elements are positive
|
|
338 integers in the range l...k-1. p is array of size n+1, used for temporary
|
|
339 storage. q controls aggregation and compaction by defining the maximum intue
|
|
340 for any symbol during transformation: q must be at least k-l; if q<=n,
|
|
341 compaction is guaranteed; if k-l>n, compaction is never done; if q is
|
|
342 INT_MAX, the maximum number of symbols are aggregated into one.
|
|
343
|
|
344 Output: Returns an integer j in the range 1...q representing the size of the
|
|
345 new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
|
|
346 set to the number of old symbols grouped into one. Only x[n] is 0.*/
|
|
347 static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
|
|
348 const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated)
|
|
349 {
|
|
350 qsint_t c, i, j;
|
|
351 qsint_t a; // numSymbolAggregated
|
|
352 qsint_t mask;
|
|
353 qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0;
|
|
354 qsint_t newAlphabetSize;
|
|
355 qsint_t maxNumInputSymbol, maxNumBit, maxSymbol;
|
|
356
|
|
357 maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
|
|
358
|
|
359 for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit;
|
|
360 maxSymbol = QSINT_MAX >> maxNumBit;
|
|
361
|
|
362 c = maxNumInputSymbol;
|
|
363 for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) {
|
|
364 minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1);
|
|
365 maxSymbolInChunk = c;
|
|
366 c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol;
|
|
367 }
|
|
368
|
|
369 mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/
|
|
370 V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/
|
|
371
|
|
372 /* bucketing possible, compact alphabet.*/
|
|
373 for (i=0; i<=maxSymbolInChunk; i++)
|
|
374 I[i] = 0; /* zero transformation table.*/
|
|
375 c = minSymbolInChunk;
|
|
376 for (i=a; i<=numChar; i++) {
|
|
377 I[c] = 1; /* mark used chunk symbol.*/
|
|
378 c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
|
|
379 }
|
|
380 for (i=1; i<a; i++) { /* handle last r-1 positions.*/
|
|
381 I[c] = 1; /* mark used chunk symbol.*/
|
|
382 c = (c & mask) << maxNumBit; /* shift in next old symbol in chunk.*/
|
|
383 }
|
|
384 newAlphabetSize = 1;
|
|
385 for (i=0; i<=maxSymbolInChunk; i++) {
|
|
386 if (I[i]) {
|
|
387 I[i] = newAlphabetSize;
|
|
388 newAlphabetSize++;
|
|
389 }
|
|
390 }
|
|
391 c = minSymbolInChunk;
|
|
392 for (i=0, j=a; j<=numChar; i++, j++) {
|
|
393 V[i] = I[c]; /* transform to new alphabet.*/
|
|
394 c = ((c & mask) << maxNumBit) | (V[j] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
|
|
395 }
|
|
396 for (; i<numChar; i++) { /* handle last a-1 positions.*/
|
|
397 V[i] = I[c]; /* transform to new alphabet.*/
|
|
398 c = (c & mask) << maxNumBit; /* shift right-end zero in chunk.*/
|
|
399 }
|
|
400
|
|
401 V[numChar] = 0; /* end-of-string symbol is zero.*/
|
|
402
|
|
403 *numSymbolAggregated = a;
|
|
404 return newAlphabetSize;
|
|
405 }
|