comparison bwa-0.6.2/QSufSort.c @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
comparison
equal deleted inserted replaced
1:a9636dc1e99a 2:a294fbfcb1db
1 /* QSufSort.c
2
3 Original source from qsufsort.c
4
5 Copyright 1999, N. Jesper Larsson, all rights reserved.
6
7 This file contains an implementation of the algorithm presented in "Faster
8 Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
9 Sadakane (sada@is.s.u-tokyo.ac.jp).
10
11 This software may be used freely for any purpose. However, when distributed,
12 the original source must be clearly stated, and, when the source code is
13 distributed, the copyright notice must be retained and any alterations in
14 the code must be clearly marked. No warranty is given regarding the quality
15 of this software.
16
17 Modified by Wong Chi-Kwong, 2004
18
19 Changes summary: - Used long variable and function names
20 - Removed global variables
21 - Replace pointer references with array references
22 - Used insertion sort in place of selection sort and increased insertion sort threshold
23 - Reconstructing suffix array from inverse becomes an option
24 - Add handling where end-of-text symbol is not necessary < all characters
25 - Removed codes for supporting alphabet size > number of characters
26
27 No warrenty is given regarding the quality of the modifications.
28
29 */
30
31
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <limits.h>
35 #include "QSufSort.h"
36
37 #define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) )
38 #define med3(a, b, c) ( a<b ? (b<c ? b : a<c ? c : a) : (b>c ? b : a>c ? c : a))
39 #define swap(a, b, t); t = a; a = b; b = t;
40
41 // Static functions
42 static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
43 const qsint_t highestPos, const qsint_t numSortedChar);
44 static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
45 const qsint_t highestPos, const qsint_t numSortedChar);
46 static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
47 const qsint_t highestPos, const qsint_t numSortedChar);
48 static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize);
49 static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
50 const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated);
51
52 /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
53 n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
54 contents of x[n] is disregarded, the n-th symbol being regarded as
55 end-of-string smaller than all other symbols.*/
56 void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
57 const qsint_t smallestInputSymbol, const int skipTransform)
58 {
59 qsint_t i, j;
60 qsint_t s, negatedSortedGroupLength;
61 qsint_t numSymbolAggregated;
62 qsint_t maxNumInputSymbol;
63 qsint_t numSortedPos = 1;
64 qsint_t newAlphabetSize;
65
66 maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
67
68 if (!skipTransform) {
69 /* bucketing possible*/
70 newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol,
71 numChar, &numSymbolAggregated);
72 QSufSortBucketSort(V, I, numChar, newAlphabetSize);
73 I[0] = -1;
74 V[numChar] = 0;
75 numSortedPos = numSymbolAggregated;
76 }
77
78 while ((qsint_t)(I[0]) >= -(qsint_t)numChar) {
79 i = 0;
80 negatedSortedGroupLength = 0;
81 do {
82 s = I[i];
83 if (s < 0) {
84 i -= s; /* skip over sorted group.*/
85 negatedSortedGroupLength += s;
86 } else {
87 if (negatedSortedGroupLength) {
88 I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine preceding sorted groups */
89 negatedSortedGroupLength = 0;
90 }
91 j = V[s] + 1;
92 QSufSortSortSplit(V, I, i, j - 1, numSortedPos);
93 i = j;
94 }
95 } while (i <= numChar);
96 if (negatedSortedGroupLength) {
97 /* array ends with a sorted group.*/
98 I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine sorted groups at end of I.*/
99 }
100 numSortedPos *= 2; /* double sorted-depth.*/
101 }
102 }
103
104 void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar)
105 {
106 qsint_t i;
107 for (i=0; i<=numChar; i++)
108 I[V[i]] = i + 1;
109 }
110
111 /* Sorting routine called for each unsorted group. Sorts the array of integers
112 (suffix numbers) of length n starting at p. The algorithm is a ternary-split
113 quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
114 Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
115 function is based on Program 7.*/
116 static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
117 const qsint_t highestPos, const qsint_t numSortedChar) {
118
119 qsint_t a, b, c, d;
120 qsint_t l, m;
121 qsint_t f, v, s, t;
122 qsint_t tmp;
123 qsint_t numItem;
124
125 numItem = highestPos - lowestPos + 1;
126
127 if (numItem <= INSERT_SORT_NUM_ITEM) {
128 QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar);
129 return;
130 }
131
132 v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar);
133
134 a = b = lowestPos;
135 c = d = highestPos;
136
137 while (1) {
138 while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) {
139 if (f == v) {
140 swap(I[a], I[b], tmp);
141 a++;
142 }
143 b++;
144 }
145 while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) {
146 if (f == v) {
147 swap(I[c], I[d], tmp);
148 d--;
149 }
150 c--;
151 }
152 if (b > c)
153 break;
154 swap(I[b], I[c], tmp);
155 b++;
156 c--;
157 }
158
159 s = a - lowestPos;
160 t = b - a;
161 s = min(s, t);
162 for (l = lowestPos, m = b - s; m < b; l++, m++) {
163 swap(I[l], I[m], tmp);
164 }
165
166 s = d - c;
167 t = highestPos - d;
168 s = min(s, t);
169 for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) {
170 swap(I[l], I[m], tmp);
171 }
172
173 s = b - a;
174 t = d - c;
175 if (s > 0)
176 QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar);
177
178 // Update group number for equal portion
179 a = lowestPos + s;
180 b = highestPos - t;
181 if (a == b) {
182 // Sorted group
183 V[I[a]] = a;
184 I[a] = -1;
185 } else {
186 // Unsorted group
187 for (c=a; c<=b; c++)
188 V[I[c]] = b;
189 }
190
191 if (t > 0)
192 QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar);
193
194 }
195
196 /* Algorithm by Bentley & McIlroy.*/
197 static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
198 const qsint_t highestPos, const qsint_t numSortedChar) {
199
200 qsint_t m;
201 qsint_t keyl, keym, keyn;
202 qsint_t key1, key2, key3;
203 qsint_t s;
204 qsint_t numItem;
205
206 numItem = highestPos - lowestPos + 1;
207
208 m = lowestPos + numItem / 2;
209
210 s = numItem / 8;
211 key1 = KEY(V, I, lowestPos, numSortedChar);
212 key2 = KEY(V, I, lowestPos+s, numSortedChar);
213 key3 = KEY(V, I, lowestPos+2*s, numSortedChar);
214 keyl = med3(key1, key2, key3);
215 key1 = KEY(V, I, m-s, numSortedChar);
216 key2 = KEY(V, I, m, numSortedChar);
217 key3 = KEY(V, I, m+s, numSortedChar);
218 keym = med3(key1, key2, key3);
219 key1 = KEY(V, I, highestPos-2*s, numSortedChar);
220 key2 = KEY(V, I, highestPos-s, numSortedChar);
221 key3 = KEY(V, I, highestPos, numSortedChar);
222 keyn = med3(key1, key2, key3);
223
224 return med3(keyl, keym, keyn);
225
226
227 }
228
229 /* Quadratic sorting method to use for small subarrays. */
230 static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
231 const qsint_t highestPos, const qsint_t numSortedChar)
232 {
233 qsint_t i, j;
234 qsint_t tmpKey, tmpPos;
235 qsint_t numItem;
236 qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM];
237 qsint_t negativeSortedLength;
238 qsint_t groupNum;
239
240 numItem = highestPos - lowestPos + 1;
241
242 for (i=0; i<numItem; i++) {
243 pos[i] = I[lowestPos + i];
244 key[i] = V[pos[i] + numSortedChar];
245 }
246
247 for (i=1; i<numItem; i++) {
248 tmpKey = key[i];
249 tmpPos = pos[i];
250 for (j=i; j>0 && key[j-1] > tmpKey; j--) {
251 key[j] = key[j-1];
252 pos[j] = pos[j-1];
253 }
254 key[j] = tmpKey;
255 pos[j] = tmpPos;
256 }
257
258 negativeSortedLength = -1;
259
260 i = numItem - 1;
261 groupNum = highestPos;
262 while (i > 0) {
263 I[i+lowestPos] = pos[i];
264 V[I[i+lowestPos]] = groupNum;
265 if (key[i-1] == key[i]) {
266 negativeSortedLength = 0;
267 } else {
268 if (negativeSortedLength < 0)
269 I[i+lowestPos] = negativeSortedLength;
270 groupNum = i + lowestPos - 1;
271 negativeSortedLength--;
272 }
273 i--;
274 }
275
276 I[lowestPos] = pos[0];
277 V[I[lowestPos]] = groupNum;
278 if (negativeSortedLength < 0)
279 I[lowestPos] = negativeSortedLength;
280 }
281
282 /* Bucketsort for first iteration.
283
284 Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
285 at least once. x[n] is 0. (This is the corresponding output of transform.) k
286 must be at most n+1. p is array of size n+1 whose contents are disregarded.
287
288 Output: x is V and p is I after the initial sorting stage of the refined
289 suffix sorting algorithm.*/
290
291 static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize)
292 {
293 qsint_t i, c;
294 qsint_t d;
295 qsint_t groupNum;
296 qsint_t currentIndex;
297
298 // mark linked list empty
299 for (i=0; i<alphabetSize; i++)
300 I[i] = -1;
301
302 // insert to linked list
303 for (i=0; i<=numChar; i++) {
304 c = V[i];
305 V[i] = (qsint_t)(I[c]);
306 I[c] = i;
307 }
308
309 currentIndex = numChar;
310 for (i=alphabetSize; i>0; i--) {
311 c = I[i-1];
312 d = (qsint_t)(V[c]);
313 groupNum = currentIndex;
314 V[c] = groupNum;
315 if (d >= 0) {
316 I[currentIndex] = c;
317 while (d >= 0) {
318 c = d;
319 d = V[c];
320 V[c] = groupNum;
321 currentIndex--;
322 I[currentIndex] = c;
323 }
324 } else {
325 // sorted group
326 I[currentIndex] = -1;
327 }
328 currentIndex--;
329 }
330 }
331
332 /* Transforms the alphabet of x by attempting to aggregate several symbols into
333 one, while preserving the suffix order of x. The alphabet may also be
334 compacted, so that x on output comprises all integers of the new alphabet
335 with no skipped numbers.
336
337 Input: x is an array of size n+1 whose first n elements are positive
338 integers in the range l...k-1. p is array of size n+1, used for temporary
339 storage. q controls aggregation and compaction by defining the maximum intue
340 for any symbol during transformation: q must be at least k-l; if q<=n,
341 compaction is guaranteed; if k-l>n, compaction is never done; if q is
342 INT_MAX, the maximum number of symbols are aggregated into one.
343
344 Output: Returns an integer j in the range 1...q representing the size of the
345 new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
346 set to the number of old symbols grouped into one. Only x[n] is 0.*/
347 static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
348 const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated)
349 {
350 qsint_t c, i, j;
351 qsint_t a; // numSymbolAggregated
352 qsint_t mask;
353 qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0;
354 qsint_t newAlphabetSize;
355 qsint_t maxNumInputSymbol, maxNumBit, maxSymbol;
356
357 maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
358
359 for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit;
360 maxSymbol = QSINT_MAX >> maxNumBit;
361
362 c = maxNumInputSymbol;
363 for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) {
364 minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1);
365 maxSymbolInChunk = c;
366 c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol;
367 }
368
369 mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/
370 V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/
371
372 /* bucketing possible, compact alphabet.*/
373 for (i=0; i<=maxSymbolInChunk; i++)
374 I[i] = 0; /* zero transformation table.*/
375 c = minSymbolInChunk;
376 for (i=a; i<=numChar; i++) {
377 I[c] = 1; /* mark used chunk symbol.*/
378 c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
379 }
380 for (i=1; i<a; i++) { /* handle last r-1 positions.*/
381 I[c] = 1; /* mark used chunk symbol.*/
382 c = (c & mask) << maxNumBit; /* shift in next old symbol in chunk.*/
383 }
384 newAlphabetSize = 1;
385 for (i=0; i<=maxSymbolInChunk; i++) {
386 if (I[i]) {
387 I[i] = newAlphabetSize;
388 newAlphabetSize++;
389 }
390 }
391 c = minSymbolInChunk;
392 for (i=0, j=a; j<=numChar; i++, j++) {
393 V[i] = I[c]; /* transform to new alphabet.*/
394 c = ((c & mask) << maxNumBit) | (V[j] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
395 }
396 for (; i<numChar; i++) { /* handle last a-1 positions.*/
397 V[i] = I[c]; /* transform to new alphabet.*/
398 c = (c & mask) << maxNumBit; /* shift right-end zero in chunk.*/
399 }
400
401 V[numChar] = 0; /* end-of-string symbol is zero.*/
402
403 *numSymbolAggregated = a;
404 return newAlphabetSize;
405 }