0
|
1 /*
|
|
2
|
|
3 BWTConstruct.c BWT-Index Construction
|
|
4
|
|
5 This module constructs BWT and auxiliary data structures.
|
|
6
|
|
7 Copyright (C) 2004, Wong Chi Kwong.
|
|
8
|
|
9 This program is free software; you can redistribute it and/or
|
|
10 modify it under the terms of the GNU General Public License
|
|
11 as published by the Free Software Foundation; either version 2
|
|
12 of the License, or (at your option) any later version.
|
|
13
|
|
14 This program is distributed in the hope that it will be useful,
|
|
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
17 GNU General Public License for more details.
|
|
18
|
|
19 You should have received a copy of the GNU General Public License
|
|
20 along with this program; if not, write to the Free Software
|
|
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
22
|
|
23 */
|
|
24
|
|
25 #include <stdio.h>
|
|
26 #include <stdlib.h>
|
|
27 #include <string.h>
|
|
28 #include "bwt_gen.h"
|
|
29 #include "QSufSort.h"
|
|
30
|
|
31 static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar,
|
|
32 unsigned int lastByteLength)
|
|
33 {
|
|
34 if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) {
|
|
35 fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n");
|
|
36 exit(1);
|
|
37 }
|
|
38 return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength;
|
|
39 }
|
|
40
|
|
41 static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue)
|
|
42 {
|
|
43 unsigned int i;
|
|
44 for (i=0; i<length; i++) startAddr[i] = initValue;
|
|
45 }
|
|
46
|
|
47 static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable)
|
|
48 {
|
|
49 unsigned int i, j, c, t;
|
|
50
|
|
51 for (i=0; i<DNA_OCC_CNT_TABLE_SIZE_IN_WORD; i++) {
|
|
52 dnaDecodeTable[i] = 0;
|
|
53 c = i;
|
|
54 for (j=0; j<8; j++) {
|
|
55 t = c & 0x00000003;
|
|
56 dnaDecodeTable[i] += 1 << (t * 8);
|
|
57 c >>= 2;
|
|
58 }
|
|
59 }
|
|
60
|
|
61 }
|
|
62 // for BWTIncCreate()
|
|
63 static unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar)
|
|
64 {
|
|
65 unsigned int numOfOccValue;
|
|
66 unsigned int numOfOccIntervalPerMajor;
|
|
67 numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
|
|
68 numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
|
|
69 return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE;
|
|
70 }
|
|
71 // for BWTIncCreate()
|
|
72 static unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar)
|
|
73 {
|
|
74 unsigned int numOfOccValue;
|
|
75 numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
|
|
76 return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE;
|
|
77 }
|
|
78 // for BWTIncCreate()
|
|
79 static unsigned int BWTResidentSizeInWord(const unsigned int numChar) {
|
|
80
|
|
81 unsigned int numCharRoundUpToOccInterval;
|
|
82
|
|
83 // The $ in BWT at the position of inverseSa0 is not encoded
|
|
84 numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL;
|
|
85
|
|
86 return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
|
|
87
|
|
88 }
|
|
89
|
|
90 static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
|
|
91 {
|
|
92 unsigned int maxBuildSize;
|
|
93
|
|
94 if (bwtInc->bwt->textLength == 0) {
|
|
95 // initial build
|
|
96 // Minus 2 because n+1 entries of seq and rank needed for n char
|
|
97 maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD)
|
|
98 / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD;
|
|
99 if (bwtInc->initialMaxBuildSize > 0) {
|
|
100 bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize);
|
|
101 } else {
|
|
102 bwtInc->buildSize = maxBuildSize;
|
|
103 }
|
|
104 } else {
|
|
105 // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char
|
|
106 // Minus numberOfIterationDone because bwt slightly shift to left in each iteration
|
|
107 maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3
|
|
108 - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR)
|
|
109 / 3;
|
|
110 if (maxBuildSize < CHAR_PER_WORD) {
|
|
111 fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
|
|
112 exit(1);
|
|
113 }
|
|
114 if (bwtInc->incMaxBuildSize > 0) {
|
|
115 bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize);
|
|
116 } else {
|
|
117 bwtInc->buildSize = maxBuildSize;
|
|
118 }
|
|
119 if (bwtInc->buildSize < CHAR_PER_WORD) {
|
|
120 bwtInc->buildSize = CHAR_PER_WORD;
|
|
121 }
|
|
122 }
|
|
123
|
|
124 if (bwtInc->buildSize < CHAR_PER_WORD) {
|
|
125 fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
|
|
126 exit(1);
|
|
127 }
|
|
128
|
|
129 bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
|
|
130
|
|
131 bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1);
|
|
132 bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1);
|
|
133
|
|
134 }
|
|
135
|
|
136 // for ceilLog2()
|
|
137 unsigned int leadingZero(const unsigned int input)
|
|
138 {
|
|
139 unsigned int l;
|
|
140 const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
|
141 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
142 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
143 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
144 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
145 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
146 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
147 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
|
148
|
|
149 if (input & 0xFFFF0000) {
|
|
150 if (input & 0xFF000000) {
|
|
151 l = leadingZero8bit[input >> 24];
|
|
152 } else {
|
|
153 l = 8 + leadingZero8bit[input >> 16];
|
|
154 }
|
|
155 } else {
|
|
156 if (input & 0x0000FF00) {
|
|
157 l = 16 + leadingZero8bit[input >> 8];
|
|
158 } else {
|
|
159 l = 24 + leadingZero8bit[input];
|
|
160 }
|
|
161 }
|
|
162 return l;
|
|
163
|
|
164 }
|
|
165 // for BitPerBytePackedChar()
|
|
166 static unsigned int ceilLog2(const unsigned int input)
|
|
167 {
|
|
168 if (input <= 1) return 0;
|
|
169 return BITS_IN_WORD - leadingZero(input - 1);
|
|
170
|
|
171 }
|
|
172 // for ConvertBytePackedToWordPacked()
|
|
173 static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize)
|
|
174 {
|
|
175 unsigned int bitPerChar;
|
|
176 bitPerChar = ceilLog2(alphabetSize);
|
|
177 // Return the largest number of bit that does not affect packing efficiency
|
|
178 if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar)
|
|
179 bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar);
|
|
180 return bitPerChar;
|
|
181 }
|
|
182 // for ConvertBytePackedToWordPacked()
|
|
183 static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize)
|
|
184 {
|
|
185 return ceilLog2(alphabetSize);
|
|
186 }
|
|
187
|
|
188 static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize,
|
|
189 const unsigned int textLength)
|
|
190 {
|
|
191 unsigned int i, j, k;
|
|
192 unsigned int c;
|
|
193 unsigned int bitPerBytePackedChar;
|
|
194 unsigned int bitPerWordPackedChar;
|
|
195 unsigned int charPerWord;
|
|
196 unsigned int charPerByte;
|
|
197 unsigned int bytePerIteration;
|
|
198 unsigned int byteProcessed = 0;
|
|
199 unsigned int wordProcessed = 0;
|
|
200 unsigned int mask, shift;
|
|
201
|
|
202 unsigned int buffer[BITS_IN_WORD];
|
|
203
|
|
204 bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize);
|
|
205 bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize);
|
|
206 charPerByte = BITS_IN_BYTE / bitPerBytePackedChar;
|
|
207 charPerWord = BITS_IN_WORD / bitPerWordPackedChar;
|
|
208
|
|
209 bytePerIteration = charPerWord / charPerByte;
|
|
210 mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar);
|
|
211 shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar;
|
|
212
|
|
213 while ((wordProcessed + 1) * charPerWord < textLength) {
|
|
214
|
|
215 k = 0;
|
|
216 for (i=0; i<bytePerIteration; i++) {
|
|
217 c = (unsigned int)input[byteProcessed] << shift;
|
|
218 for (j=0; j<charPerByte; j++) {
|
|
219 buffer[k] = c & mask;
|
|
220 c <<= bitPerBytePackedChar;
|
|
221 k++;
|
|
222 }
|
|
223 byteProcessed++;
|
|
224 }
|
|
225
|
|
226 c = 0;
|
|
227 for (i=0; i<charPerWord; i++) {
|
|
228 c |= buffer[i] >> bitPerWordPackedChar * i;
|
|
229 }
|
|
230 output[wordProcessed] = c;
|
|
231 wordProcessed++;
|
|
232
|
|
233 }
|
|
234
|
|
235 k = 0;
|
|
236 for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) {
|
|
237 c = (unsigned int)input[byteProcessed] << shift;
|
|
238 for (j=0; j<charPerByte; j++) {
|
|
239 buffer[k] = c & mask;
|
|
240 c <<= bitPerBytePackedChar;
|
|
241 k++;
|
|
242 }
|
|
243 byteProcessed++;
|
|
244 }
|
|
245
|
|
246 c = 0;
|
|
247 for (i=0; i<textLength - wordProcessed * charPerWord; i++) {
|
|
248 c |= buffer[i] >> bitPerWordPackedChar * i;
|
|
249 }
|
|
250 output[wordProcessed] = c;
|
|
251 }
|
|
252
|
|
253 BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable)
|
|
254 {
|
|
255 BWT *bwt;
|
|
256
|
|
257 bwt = (BWT*)calloc(1, sizeof(BWT));
|
|
258
|
|
259 bwt->textLength = 0;
|
|
260 bwt->inverseSa = 0;
|
|
261
|
|
262 bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*));
|
|
263 initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0);
|
|
264
|
|
265 bwt->bwtSizeInWord = 0;
|
|
266 bwt->saValueOnBoundary = NULL;
|
|
267
|
|
268 // Generate decode tables
|
|
269 if (decodeTable == NULL) {
|
|
270 bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int));
|
|
271 GenerateDNAOccCountTable(bwt->decodeTable);
|
|
272 } else {
|
|
273 bwt->decodeTable = decodeTable;
|
|
274 }
|
|
275
|
|
276 bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength);
|
|
277 bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int));
|
|
278
|
|
279 bwt->occSizeInWord = 0;
|
|
280 bwt->occValue = NULL;
|
|
281
|
|
282 bwt->saInterval = ALL_ONE_MASK;
|
|
283 bwt->saValueSize = 0;
|
|
284 bwt->saValue = NULL;
|
|
285
|
|
286 bwt->inverseSaInterval = ALL_ONE_MASK;
|
|
287 bwt->inverseSaSize = 0;
|
|
288 bwt->inverseSa = NULL;
|
|
289
|
|
290 return bwt;
|
|
291 }
|
|
292
|
|
293 BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit,
|
|
294 const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
|
|
295 {
|
|
296 BWTInc *bwtInc;
|
|
297 unsigned int i;
|
|
298
|
|
299 if (targetNBit == 0) {
|
|
300 fprintf(stderr, "BWTIncCreate() : targetNBit = 0!\n");
|
|
301 exit(1);
|
|
302 }
|
|
303
|
|
304 bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc));
|
|
305 bwtInc->numberOfIterationDone = 0;
|
|
306 bwtInc->bwt = BWTCreate(textLength, NULL);
|
|
307 bwtInc->initialMaxBuildSize = initialMaxBuildSize;
|
|
308 bwtInc->incMaxBuildSize = incMaxBuildSize;
|
|
309 bwtInc->targetNBit = targetNBit;
|
|
310 bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int));
|
|
311 initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
|
|
312
|
|
313 // Build frequently accessed data
|
|
314 bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int));
|
|
315 for (i=0; i<CHAR_PER_WORD; i++) {
|
|
316 bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR;
|
|
317 }
|
|
318
|
|
319 bwtInc->targetTextLength = textLength;
|
|
320 bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit);
|
|
321 if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) {
|
|
322 fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n");
|
|
323 exit(1);
|
|
324 }
|
|
325 bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD);
|
|
326
|
|
327 return bwtInc;
|
|
328
|
|
329 }
|
|
330 // for BWTIncConstruct()
|
|
331 static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank,
|
|
332 unsigned int* __restrict cumulativeCount, const unsigned int numChar)
|
|
333 {
|
|
334 unsigned int i, j;
|
|
335 unsigned int c, t;
|
|
336 unsigned int packedMask;
|
|
337 unsigned int rankIndex;
|
|
338 unsigned int lastWord, numCharInLastWord;
|
|
339
|
|
340 lastWord = (numChar - 1) / CHAR_PER_WORD;
|
|
341 numCharInLastWord = numChar - lastWord * CHAR_PER_WORD;
|
|
342
|
|
343 packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR);
|
|
344 rankIndex = numChar - 1;
|
|
345
|
|
346 t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR);
|
|
347 for (i=0; i<numCharInLastWord; i++) {
|
|
348 c = t & packedMask;
|
|
349 cumulativeCount[c+1]++;
|
|
350 rank[rankIndex] = c;
|
|
351 rankIndex--;
|
|
352 t >>= BIT_PER_CHAR;
|
|
353 }
|
|
354
|
|
355 for (i=lastWord; i--;) { // loop from lastWord - 1 to 0
|
|
356 t = packedText[i];
|
|
357 for (j=0; j<CHAR_PER_WORD; j++) {
|
|
358 c = t & packedMask;
|
|
359 cumulativeCount[c+1]++;
|
|
360 rank[rankIndex] = c;
|
|
361 rankIndex--;
|
|
362 t >>= BIT_PER_CHAR;
|
|
363 }
|
|
364 }
|
|
365
|
|
366 // Convert occurrence to cumulativeCount
|
|
367 cumulativeCount[2] += cumulativeCount[1];
|
|
368 cumulativeCount[3] += cumulativeCount[2];
|
|
369 cumulativeCount[4] += cumulativeCount[3];
|
|
370 }
|
|
371
|
|
372
|
|
373 static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index,
|
|
374 unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable)
|
|
375 {
|
|
376 static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
|
|
377 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
|
|
378 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
|
|
379 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
|
|
380
|
|
381 unsigned int iteration, wordToCount, charToCount;
|
|
382 unsigned int i, j, c;
|
|
383 unsigned int sum;
|
|
384
|
|
385 occCount[0] = 0;
|
|
386 occCount[1] = 0;
|
|
387 occCount[2] = 0;
|
|
388 occCount[3] = 0;
|
|
389
|
|
390 iteration = index / 256;
|
|
391 wordToCount = (index - iteration * 256) / 16;
|
|
392 charToCount = index - iteration * 256 - wordToCount * 16;
|
|
393
|
|
394 for (i=0; i<iteration; i++) {
|
|
395
|
|
396 sum = 0;
|
|
397 for (j=0; j<16; j++) {
|
|
398 sum += dnaDecodeTable[*dna >> 16];
|
|
399 sum += dnaDecodeTable[*dna & 0x0000FFFF];
|
|
400 dna++;
|
|
401 }
|
|
402 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
|
403 occCount[0] += sum & 0x000000FF; sum >>= 8;
|
|
404 occCount[1] += sum & 0x000000FF; sum >>= 8;
|
|
405 occCount[2] += sum & 0x000000FF; sum >>= 8;
|
|
406 occCount[3] += sum;
|
|
407 } else {
|
|
408 // only some or all of the 3 bits are on
|
|
409 // in reality, only one of the four cases are possible
|
|
410 if (sum == 0x00000100) {
|
|
411 occCount[0] += 256;
|
|
412 } else if (sum == 0x00010000) {
|
|
413 occCount[1] += 256;
|
|
414 } else if (sum == 0x01000000) {
|
|
415 occCount[2] += 256;
|
|
416 } else if (sum == 0x00000000) {
|
|
417 occCount[3] += 256;
|
|
418 } else {
|
|
419 fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n");
|
|
420 exit(1);
|
|
421 }
|
|
422 }
|
|
423
|
|
424 }
|
|
425
|
|
426 sum = 0;
|
|
427 for (j=0; j<wordToCount; j++) {
|
|
428 sum += dnaDecodeTable[*dna >> 16];
|
|
429 sum += dnaDecodeTable[*dna & 0x0000FFFF];
|
|
430 dna++;
|
|
431 }
|
|
432
|
|
433 if (charToCount > 0) {
|
|
434 c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c;
|
|
435 sum += dnaDecodeTable[c >> 16];
|
|
436 sum += dnaDecodeTable[c & 0xFFFF];
|
|
437 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
|
|
438 }
|
|
439
|
|
440 occCount[0] += sum & 0x000000FF; sum >>= 8;
|
|
441 occCount[1] += sum & 0x000000FF; sum >>= 8;
|
|
442 occCount[2] += sum & 0x000000FF; sum >>= 8;
|
|
443 occCount[3] += sum;
|
|
444 }
|
|
445
|
|
446 static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar,
|
|
447 const unsigned int *cumulativeCount, const unsigned int *packedShift) {
|
|
448
|
|
449 unsigned int i, c, r;
|
|
450 unsigned int previousRank, currentRank;
|
|
451 unsigned int wordIndex, charIndex;
|
|
452 unsigned int inverseSa0;
|
|
453
|
|
454 inverseSa0 = previousRank = relativeRank[0];
|
|
455
|
|
456 for (i=1; i<=numChar; i++) {
|
|
457 currentRank = relativeRank[i];
|
|
458 // previousRank > cumulativeCount[c] because $ is one of the char
|
|
459 c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2])
|
|
460 + (previousRank > cumulativeCount[3]);
|
|
461 // set bwt for currentRank
|
|
462 if (c > 0) {
|
|
463 // c <> 'a'
|
|
464 r = currentRank;
|
|
465 if (r > inverseSa0) {
|
|
466 // - 1 because $ at inverseSa0 is not encoded
|
|
467 r--;
|
|
468 }
|
|
469 wordIndex = r / CHAR_PER_WORD;
|
|
470 charIndex = r - wordIndex * CHAR_PER_WORD;
|
|
471 bwt[wordIndex] |= c << packedShift[charIndex];
|
|
472 }
|
|
473 previousRank = currentRank;
|
|
474 }
|
|
475 }
|
|
476
|
|
477 static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit,
|
|
478 const unsigned int character)
|
|
479 {
|
|
480 unsigned int occIndexMajor;
|
|
481
|
|
482 occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR;
|
|
483
|
|
484 if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) {
|
|
485 return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
|
|
486 (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16);
|
|
487
|
|
488 } else {
|
|
489 return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
|
|
490 (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF);
|
|
491 }
|
|
492 }
|
|
493
|
|
494
|
|
495 static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character,
|
|
496 const unsigned int* dnaDecodeTable)
|
|
497 {
|
|
498 static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
|
|
499 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
|
|
500 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
|
|
501 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
|
|
502
|
|
503 unsigned int wordToCount, charToCount;
|
|
504 unsigned int i, c;
|
|
505 unsigned int sum = 0;
|
|
506
|
|
507 wordToCount = index / 16;
|
|
508 charToCount = index - wordToCount * 16;
|
|
509
|
|
510 for (i=0; i<wordToCount; i++) {
|
|
511 sum += dnaDecodeTable[dna[i] >> 16];
|
|
512 sum += dnaDecodeTable[dna[i] & 0x0000FFFF];
|
|
513 }
|
|
514
|
|
515 if (charToCount > 0) {
|
|
516 c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c;
|
|
517 sum += dnaDecodeTable[c >> 16];
|
|
518 sum += dnaDecodeTable[c & 0xFFFF];
|
|
519 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
|
|
520 }
|
|
521
|
|
522 return (sum >> (character * 8)) & 0x000000FF;
|
|
523
|
|
524 }
|
|
525
|
|
526 static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character,
|
|
527 const unsigned int* dnaDecodeTable)
|
|
528 {
|
|
529 static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F,
|
|
530 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF,
|
|
531 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF,
|
|
532 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF };
|
|
533
|
|
534 unsigned int wordToCount, charToCount;
|
|
535 unsigned int i, c;
|
|
536 unsigned int sum = 0;
|
|
537
|
|
538 wordToCount = index / 16;
|
|
539 charToCount = index - wordToCount * 16;
|
|
540
|
|
541 dna -= wordToCount + 1;
|
|
542
|
|
543 if (charToCount > 0) {
|
|
544 c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c;
|
|
545 sum += dnaDecodeTable[c >> 16];
|
|
546 sum += dnaDecodeTable[c & 0xFFFF];
|
|
547 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess
|
|
548 }
|
|
549
|
|
550 for (i=0; i<wordToCount; i++) {
|
|
551 dna++;
|
|
552 sum += dnaDecodeTable[*dna >> 16];
|
|
553 sum += dnaDecodeTable[*dna & 0x0000FFFF];
|
|
554 }
|
|
555
|
|
556 return (sum >> (character * 8)) & 0x000000FF;
|
|
557
|
|
558 }
|
|
559
|
|
560 unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) {
|
|
561
|
|
562 unsigned int occValue;
|
|
563 unsigned int occExplicitIndex, occIndex;
|
|
564
|
|
565 // $ is supposed to be positioned at inverseSa0 but it is not encoded
|
|
566 // therefore index is subtracted by 1 for adjustment
|
|
567 if (index > bwt->inverseSa0) {
|
|
568 index--;
|
|
569 }
|
|
570
|
|
571 occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding
|
|
572 occIndex = occExplicitIndex * OCC_INTERVAL;
|
|
573 occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character);
|
|
574
|
|
575 if (occIndex == index) {
|
|
576 return occValue;
|
|
577 }
|
|
578
|
|
579 if (occIndex < index) {
|
|
580 return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable);
|
|
581 } else {
|
|
582 return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable);
|
|
583 }
|
|
584
|
|
585 }
|
|
586
|
|
587 static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq,
|
|
588 const unsigned int *packedText, const unsigned int numChar,
|
|
589 const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration)
|
|
590 {
|
|
591 unsigned int saIndex;
|
|
592 unsigned int lastWord;
|
|
593 unsigned int packedMask;
|
|
594 unsigned int i, j;
|
|
595 unsigned int c, t;
|
|
596 unsigned int rankIndex;
|
|
597 unsigned int shift;
|
|
598 unsigned int seqIndexFromStart[ALPHABET_SIZE];
|
|
599 unsigned int seqIndexFromEnd[ALPHABET_SIZE];
|
|
600
|
|
601 for (i=0; i<ALPHABET_SIZE; i++) {
|
|
602 seqIndexFromStart[i] = cumulativeCount[i];
|
|
603 seqIndexFromEnd[i] = cumulativeCount[i+1] - 1;
|
|
604 }
|
|
605
|
|
606 shift = BITS_IN_WORD - BIT_PER_CHAR;
|
|
607 packedMask = ALL_ONE_MASK >> shift;
|
|
608 saIndex = bwt->inverseSa0;
|
|
609 rankIndex = numChar - 1;
|
|
610
|
|
611 lastWord = numChar / CHAR_PER_WORD;
|
|
612 for (i=lastWord; i--;) { // loop from lastWord - 1 to 0
|
|
613 t = packedText[i];
|
|
614 for (j=0; j<CHAR_PER_WORD; j++) {
|
|
615 c = t & packedMask;
|
|
616 saIndex = bwt->cumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1;
|
|
617 // A counting sort using the first character of suffix is done here
|
|
618 // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0
|
|
619 if (saIndex > bwt->inverseSa0) {
|
|
620 seq[seqIndexFromEnd[c]] = rankIndex;
|
|
621 absoluteRank[seqIndexFromEnd[c]] = saIndex;
|
|
622 seqIndexFromEnd[c]--;
|
|
623 } else {
|
|
624 seq[seqIndexFromStart[c]] = rankIndex;
|
|
625 absoluteRank[seqIndexFromStart[c]] = saIndex;
|
|
626 seqIndexFromStart[c]++;
|
|
627 }
|
|
628 rankIndex--;
|
|
629 t >>= BIT_PER_CHAR;
|
|
630 }
|
|
631 }
|
|
632
|
|
633 absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters
|
|
634 seq[seqIndexFromStart[firstCharInLastIteration]] = numChar;
|
|
635
|
|
636 return seqIndexFromStart[firstCharInLastIteration];
|
|
637 }
|
|
638
|
|
639 static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict seq, const unsigned int numItem)
|
|
640 {
|
|
641 #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD
|
|
642
|
|
643 int lowIndex, highIndex, midIndex;
|
|
644 int lowPartitionIndex, highPartitionIndex;
|
|
645 int lowStack[32], highStack[32];
|
|
646 int stackDepth;
|
|
647 int i, j;
|
|
648 unsigned int tempSeq, tempKey;
|
|
649 int numberOfEqualKey;
|
|
650
|
|
651 if (numItem < 2) return;
|
|
652
|
|
653 stackDepth = 0;
|
|
654
|
|
655 lowIndex = 0;
|
|
656 highIndex = numItem - 1;
|
|
657
|
|
658 for (;;) {
|
|
659
|
|
660 for (;;) {
|
|
661
|
|
662 // Sort small array of data
|
|
663 if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays
|
|
664 for (i=lowIndex+1; i<=highIndex; i++) {
|
|
665 tempSeq = seq[i];
|
|
666 tempKey = key[i];
|
|
667 for (j = i; j > lowIndex && key[j-1] > tempKey; j--) {
|
|
668 seq[j] = seq[j-1];
|
|
669 key[j] = key[j-1];
|
|
670 }
|
|
671 if (j != i) {
|
|
672 seq[j] = tempSeq;
|
|
673 key[j] = tempKey;
|
|
674 }
|
|
675 }
|
|
676 break;
|
|
677 }
|
|
678
|
|
679 // Choose pivot as median of the lowest, middle, and highest data; sort the three data
|
|
680
|
|
681 midIndex = average(lowIndex, highIndex);
|
|
682 if (key[lowIndex] > key[midIndex]) {
|
|
683 tempSeq = seq[lowIndex];
|
|
684 tempKey = key[lowIndex];
|
|
685 seq[lowIndex] = seq[midIndex];
|
|
686 key[lowIndex] = key[midIndex];
|
|
687 seq[midIndex] = tempSeq;
|
|
688 key[midIndex] = tempKey;
|
|
689 }
|
|
690 if (key[lowIndex] > key[highIndex]) {
|
|
691 tempSeq = seq[lowIndex];
|
|
692 tempKey = key[lowIndex];
|
|
693 seq[lowIndex] = seq[highIndex];
|
|
694 key[lowIndex] = key[highIndex];
|
|
695 seq[highIndex] = tempSeq;
|
|
696 key[highIndex] = tempKey;
|
|
697 }
|
|
698 if (key[midIndex] > key[highIndex]) {
|
|
699 tempSeq = seq[midIndex];
|
|
700 tempKey = key[midIndex];
|
|
701 seq[midIndex] = seq[highIndex];
|
|
702 key[midIndex] = key[highIndex];
|
|
703 seq[highIndex] = tempSeq;
|
|
704 key[highIndex] = tempKey;
|
|
705 }
|
|
706
|
|
707 // Partition data
|
|
708
|
|
709 numberOfEqualKey = 0;
|
|
710
|
|
711 lowPartitionIndex = lowIndex + 1;
|
|
712 highPartitionIndex = highIndex - 1;
|
|
713
|
|
714 for (;;) {
|
|
715 while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) {
|
|
716 numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]);
|
|
717 lowPartitionIndex++;
|
|
718 }
|
|
719 while (lowPartitionIndex < highPartitionIndex) {
|
|
720 if (key[midIndex] >= key[highPartitionIndex]) {
|
|
721 numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]);
|
|
722 break;
|
|
723 }
|
|
724 highPartitionIndex--;
|
|
725 }
|
|
726 if (lowPartitionIndex >= highPartitionIndex) {
|
|
727 break;
|
|
728 }
|
|
729 tempSeq = seq[lowPartitionIndex];
|
|
730 tempKey = key[lowPartitionIndex];
|
|
731 seq[lowPartitionIndex] = seq[highPartitionIndex];
|
|
732 key[lowPartitionIndex] = key[highPartitionIndex];
|
|
733 seq[highPartitionIndex] = tempSeq;
|
|
734 key[highPartitionIndex] = tempKey;
|
|
735 if (highPartitionIndex == midIndex) {
|
|
736 // partition key has been moved
|
|
737 midIndex = lowPartitionIndex;
|
|
738 }
|
|
739 lowPartitionIndex++;
|
|
740 highPartitionIndex--;
|
|
741 }
|
|
742
|
|
743 // Adjust the partition index
|
|
744 highPartitionIndex = lowPartitionIndex;
|
|
745 lowPartitionIndex--;
|
|
746
|
|
747 // move the partition key to end of low partition
|
|
748 tempSeq = seq[midIndex];
|
|
749 tempKey = key[midIndex];
|
|
750 seq[midIndex] = seq[lowPartitionIndex];
|
|
751 key[midIndex] = key[lowPartitionIndex];
|
|
752 seq[lowPartitionIndex] = tempSeq;
|
|
753 key[lowPartitionIndex] = tempKey;
|
|
754
|
|
755 if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) {
|
|
756
|
|
757 // Many keys = partition key; separate the equal key data from the lower partition
|
|
758
|
|
759 midIndex = lowIndex;
|
|
760
|
|
761 for (;;) {
|
|
762 while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) {
|
|
763 midIndex++;
|
|
764 }
|
|
765 while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) {
|
|
766 lowPartitionIndex--;
|
|
767 }
|
|
768 if (midIndex >= lowPartitionIndex) {
|
|
769 break;
|
|
770 }
|
|
771 tempSeq = seq[midIndex];
|
|
772 tempKey = key[midIndex];
|
|
773 seq[midIndex] = seq[lowPartitionIndex - 1];
|
|
774 key[midIndex] = key[lowPartitionIndex - 1];
|
|
775 seq[lowPartitionIndex - 1] = tempSeq;
|
|
776 key[lowPartitionIndex - 1] = tempKey;
|
|
777 midIndex++;
|
|
778 lowPartitionIndex--;
|
|
779 }
|
|
780
|
|
781 }
|
|
782
|
|
783 if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) {
|
|
784 // put the larger partition to stack
|
|
785 lowStack[stackDepth] = lowIndex;
|
|
786 highStack[stackDepth] = lowPartitionIndex - 1;
|
|
787 stackDepth++;
|
|
788 // sort the smaller partition first
|
|
789 lowIndex = highPartitionIndex;
|
|
790 } else {
|
|
791 // put the larger partition to stack
|
|
792 lowStack[stackDepth] = highPartitionIndex;
|
|
793 highStack[stackDepth] = highIndex;
|
|
794 stackDepth++;
|
|
795 // sort the smaller partition first
|
|
796 if (lowPartitionIndex > lowIndex) {
|
|
797 highIndex = lowPartitionIndex - 1;
|
|
798 } else {
|
|
799 // all keys in the partition equals to the partition key
|
|
800 break;
|
|
801 }
|
|
802 }
|
|
803 continue;
|
|
804 }
|
|
805
|
|
806 // Pop a range from stack
|
|
807 if (stackDepth > 0) {
|
|
808 stackDepth--;
|
|
809 lowIndex = lowStack[stackDepth];
|
|
810 highIndex = highStack[stackDepth];
|
|
811 continue;
|
|
812 } else return;
|
|
813 }
|
|
814 }
|
|
815
|
|
816
|
|
817 static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigned int* __restrict seq,
|
|
818 unsigned int* __restrict relativeRank, const unsigned int numItem,
|
|
819 unsigned int oldInverseSa0, const unsigned int *cumulativeCount)
|
|
820 {
|
|
821 unsigned int i, c;
|
|
822 unsigned int s, r;
|
|
823 unsigned int lastRank, lastIndex;
|
|
824 unsigned int oldInverseSa0RelativeRank = 0;
|
|
825 unsigned int freq;
|
|
826
|
|
827 lastIndex = numItem;
|
|
828 lastRank = sortedRank[numItem];
|
|
829 if (lastRank > oldInverseSa0) {
|
|
830 sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt
|
|
831 }
|
|
832 s = seq[numItem];
|
|
833 relativeRank[s] = numItem;
|
|
834 if (lastRank == oldInverseSa0) {
|
|
835 oldInverseSa0RelativeRank = numItem;
|
|
836 oldInverseSa0++; // so that this segment of code is not run again
|
|
837 lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
|
|
838 }
|
|
839
|
|
840 c = ALPHABET_SIZE - 1;
|
|
841 freq = cumulativeCount[c];
|
|
842
|
|
843 for (i=numItem; i--;) { // from numItem - 1 to 0
|
|
844 r = sortedRank[i];
|
|
845 if (r > oldInverseSa0) {
|
|
846 sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt
|
|
847 }
|
|
848 s = seq[i];
|
|
849 if (i < freq) {
|
|
850 if (lastIndex >= freq) {
|
|
851 lastRank++; // to trigger the group across alphabet boundary to be split
|
|
852 }
|
|
853 c--;
|
|
854 freq = cumulativeCount[c];
|
|
855 }
|
|
856 if (r == lastRank) {
|
|
857 relativeRank[s] = lastIndex;
|
|
858 } else {
|
|
859 if (i == lastIndex - 1) {
|
|
860 if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) {
|
|
861 seq[lastIndex] = seq[lastIndex + 1] - 1;
|
|
862 } else {
|
|
863 seq[lastIndex] = (unsigned int)-1;
|
|
864 }
|
|
865 }
|
|
866 lastIndex = i;
|
|
867 lastRank = r;
|
|
868 relativeRank[s] = i;
|
|
869 if (r == oldInverseSa0) {
|
|
870 oldInverseSa0RelativeRank = i;
|
|
871 oldInverseSa0++; // so that this segment of code is not run again
|
|
872 lastRank++; // so that oldInverseSa0 become a sorted group with 1 item
|
|
873 }
|
|
874 }
|
|
875 }
|
|
876
|
|
877 }
|
|
878
|
|
879 static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar,
|
|
880 const unsigned int *cumulativeCount)
|
|
881 {
|
|
882 unsigned int i, c;
|
|
883 unsigned int previousRank, currentRank;
|
|
884
|
|
885 previousRank = relativeRank[0];
|
|
886
|
|
887 for (i=1; i<=numChar; i++) {
|
|
888 currentRank = relativeRank[i];
|
|
889 c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2])
|
|
890 + (previousRank >= cumulativeCount[3]);
|
|
891 seq[currentRank] = c;
|
|
892 previousRank = currentRank;
|
|
893 }
|
|
894 }
|
|
895
|
|
896 static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
|
|
897 unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt)
|
|
898 {
|
|
899 unsigned int bitsInWordMinusBitPerChar;
|
|
900 unsigned int leftShift, rightShift;
|
|
901 unsigned int o;
|
|
902 unsigned int oIndex, iIndex, mIndex;
|
|
903 unsigned int mWord, mChar, oWord, oChar;
|
|
904 unsigned int numInsert;
|
|
905
|
|
906 bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
|
|
907
|
|
908 oIndex = 0;
|
|
909 iIndex = 0;
|
|
910 mIndex = 0;
|
|
911
|
|
912 mWord = 0;
|
|
913 mChar = 0;
|
|
914
|
|
915 mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration
|
|
916
|
|
917 while (oIndex < numOldBwt) {
|
|
918
|
|
919 // copy from insertBwt
|
|
920 while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) {
|
|
921 if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0
|
|
922 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
|
|
923 mIndex++;
|
|
924 mChar++;
|
|
925 if (mChar == CHAR_PER_WORD) {
|
|
926 mChar = 0;
|
|
927 mWord++;
|
|
928 mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary
|
|
929 }
|
|
930 }
|
|
931 iIndex++;
|
|
932 }
|
|
933
|
|
934 // Copy from oldBwt to mergedBwt
|
|
935 if (iIndex <= numInsertBwt) {
|
|
936 o = sortedRank[iIndex];
|
|
937 } else {
|
|
938 o = numOldBwt;
|
|
939 }
|
|
940 numInsert = o - oIndex;
|
|
941
|
|
942 oWord = oIndex / CHAR_PER_WORD;
|
|
943 oChar = oIndex - oWord * CHAR_PER_WORD;
|
|
944 if (oChar > mChar) {
|
|
945 leftShift = (oChar - mChar) * BIT_PER_CHAR;
|
|
946 rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR;
|
|
947 mergedBwt[mWord] = mergedBwt[mWord]
|
|
948 | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR))
|
|
949 | (oldBwt[oWord+1] >> rightShift);
|
|
950 oIndex += min(numInsert, CHAR_PER_WORD - mChar);
|
|
951 while (o > oIndex) {
|
|
952 oWord++;
|
|
953 mWord++;
|
|
954 mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift);
|
|
955 oIndex += CHAR_PER_WORD;
|
|
956 }
|
|
957 } else if (oChar < mChar) {
|
|
958 rightShift = (mChar - oChar) * BIT_PER_CHAR;
|
|
959 leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR;
|
|
960 mergedBwt[mWord] = mergedBwt[mWord]
|
|
961 | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR));
|
|
962 oIndex += min(numInsert, CHAR_PER_WORD - mChar);
|
|
963 while (o > oIndex) {
|
|
964 oWord++;
|
|
965 mWord++;
|
|
966 mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift);
|
|
967 oIndex += CHAR_PER_WORD;
|
|
968 }
|
|
969 } else { // oChar == mChar
|
|
970 mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR);
|
|
971 oIndex += min(numInsert, CHAR_PER_WORD - mChar);
|
|
972 while (o > oIndex) {
|
|
973 oWord++;
|
|
974 mWord++;
|
|
975 mergedBwt[mWord] = oldBwt[oWord];
|
|
976 oIndex += CHAR_PER_WORD;
|
|
977 }
|
|
978 }
|
|
979 oIndex = o;
|
|
980 mIndex += numInsert;
|
|
981
|
|
982 // Clear the trailing garbage in mergedBwt
|
|
983 mWord = mIndex / CHAR_PER_WORD;
|
|
984 mChar = mIndex - mWord * CHAR_PER_WORD;
|
|
985 if (mChar == 0) {
|
|
986 mergedBwt[mWord] = 0;
|
|
987 } else {
|
|
988 mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR));
|
|
989 }
|
|
990
|
|
991 }
|
|
992
|
|
993 // copy from insertBwt
|
|
994 while (iIndex <= numInsertBwt) {
|
|
995 if (sortedRank[iIndex] != 0) {
|
|
996 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
|
|
997 mIndex++;
|
|
998 mChar++;
|
|
999 if (mChar == CHAR_PER_WORD) {
|
|
1000 mChar = 0;
|
|
1001 mWord++;
|
|
1002 mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary
|
|
1003 }
|
|
1004 }
|
|
1005 iIndex++;
|
|
1006 }
|
|
1007 }
|
|
1008
|
|
1009 void BWTClearTrailingBwtCode(BWT *bwt)
|
|
1010 {
|
|
1011 unsigned int bwtResidentSizeInWord;
|
|
1012 unsigned int wordIndex, offset;
|
|
1013 unsigned int i;
|
|
1014
|
|
1015 bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength);
|
|
1016
|
|
1017 wordIndex = bwt->textLength / CHAR_PER_WORD;
|
|
1018 offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR;
|
|
1019 if (offset > 0) {
|
|
1020 bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset);
|
|
1021 } else {
|
|
1022 if (wordIndex < bwtResidentSizeInWord) {
|
|
1023 bwt->bwtCode[wordIndex] = 0;
|
|
1024 }
|
|
1025 }
|
|
1026
|
|
1027 for (i=wordIndex+1; i<bwtResidentSizeInWord; i++) {
|
|
1028 bwt->bwtCode[i] = 0;
|
|
1029 }
|
|
1030 }
|
|
1031
|
|
1032
|
|
1033 void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue,
|
|
1034 unsigned int* __restrict occValueMajor,
|
|
1035 const unsigned int textLength, const unsigned int* decodeTable)
|
|
1036 {
|
|
1037 unsigned int numberOfOccValueMajor, numberOfOccValue;
|
|
1038 unsigned int wordBetweenOccValue;
|
|
1039 unsigned int numberOfOccIntervalPerMajor;
|
|
1040 unsigned int c;
|
|
1041 unsigned int i, j;
|
|
1042 unsigned int occMajorIndex;
|
|
1043 unsigned int occIndex, bwtIndex;
|
|
1044 unsigned int sum;
|
|
1045 unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE];
|
|
1046
|
|
1047 wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD;
|
|
1048
|
|
1049 // Calculate occValue
|
|
1050 // [lh3] by default: OCC_INTERVAL_MAJOR=65536, OCC_INTERVAL=256
|
|
1051 numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
|
|
1052 numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
|
|
1053 numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor;
|
|
1054
|
|
1055 tempOccValue0[0] = 0;
|
|
1056 tempOccValue0[1] = 0;
|
|
1057 tempOccValue0[2] = 0;
|
|
1058 tempOccValue0[3] = 0;
|
|
1059 occValueMajor[0] = 0;
|
|
1060 occValueMajor[1] = 0;
|
|
1061 occValueMajor[2] = 0;
|
|
1062 occValueMajor[3] = 0;
|
|
1063
|
|
1064 occIndex = 0;
|
|
1065 bwtIndex = 0;
|
|
1066 for (occMajorIndex=1; occMajorIndex<numberOfOccValueMajor; occMajorIndex++) {
|
|
1067
|
|
1068 for (i=0; i<numberOfOccIntervalPerMajor/2; i++) {
|
|
1069
|
|
1070 sum = 0;
|
|
1071 tempOccValue1[0] = tempOccValue0[0];
|
|
1072 tempOccValue1[1] = tempOccValue0[1];
|
|
1073 tempOccValue1[2] = tempOccValue0[2];
|
|
1074 tempOccValue1[3] = tempOccValue0[3];
|
|
1075
|
|
1076 for (j=0; j<wordBetweenOccValue; j++) {
|
|
1077 c = bwt[bwtIndex];
|
|
1078 sum += decodeTable[c >> 16];
|
|
1079 sum += decodeTable[c & 0x0000FFFF];
|
|
1080 bwtIndex++;
|
|
1081 }
|
|
1082 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
|
1083 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
|
|
1084 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
|
|
1085 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
|
|
1086 tempOccValue1[3] += sum;
|
|
1087 } else {
|
|
1088 if (sum == 0x00000100) {
|
|
1089 tempOccValue1[0] += 256;
|
|
1090 } else if (sum == 0x00010000) {
|
|
1091 tempOccValue1[1] += 256;
|
|
1092 } else if (sum == 0x01000000) {
|
|
1093 tempOccValue1[2] += 256;
|
|
1094 } else {
|
|
1095 tempOccValue1[3] += 256;
|
|
1096 }
|
|
1097 }
|
|
1098 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
|
|
1099 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
|
|
1100 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
|
|
1101 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
|
|
1102 tempOccValue0[0] = tempOccValue1[0];
|
|
1103 tempOccValue0[1] = tempOccValue1[1];
|
|
1104 tempOccValue0[2] = tempOccValue1[2];
|
|
1105 tempOccValue0[3] = tempOccValue1[3];
|
|
1106 sum = 0;
|
|
1107
|
|
1108 occIndex++;
|
|
1109
|
|
1110 for (j=0; j<wordBetweenOccValue; j++) {
|
|
1111 c = bwt[bwtIndex];
|
|
1112 sum += decodeTable[c >> 16];
|
|
1113 sum += decodeTable[c & 0x0000FFFF];
|
|
1114 bwtIndex++;
|
|
1115 }
|
|
1116 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
|
1117 tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8;
|
|
1118 tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8;
|
|
1119 tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8;
|
|
1120 tempOccValue0[3] += sum;
|
|
1121 } else {
|
|
1122 if (sum == 0x00000100) {
|
|
1123 tempOccValue0[0] += 256;
|
|
1124 } else if (sum == 0x00010000) {
|
|
1125 tempOccValue0[1] += 256;
|
|
1126 } else if (sum == 0x01000000) {
|
|
1127 tempOccValue0[2] += 256;
|
|
1128 } else {
|
|
1129 tempOccValue0[3] += 256;
|
|
1130 }
|
|
1131 }
|
|
1132 }
|
|
1133
|
|
1134 occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0];
|
|
1135 occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1];
|
|
1136 occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2];
|
|
1137 occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3];
|
|
1138 tempOccValue0[0] = 0;
|
|
1139 tempOccValue0[1] = 0;
|
|
1140 tempOccValue0[2] = 0;
|
|
1141 tempOccValue0[3] = 0;
|
|
1142
|
|
1143 }
|
|
1144
|
|
1145 while (occIndex < (numberOfOccValue-1)/2) {
|
|
1146 sum = 0;
|
|
1147 tempOccValue1[0] = tempOccValue0[0];
|
|
1148 tempOccValue1[1] = tempOccValue0[1];
|
|
1149 tempOccValue1[2] = tempOccValue0[2];
|
|
1150 tempOccValue1[3] = tempOccValue0[3];
|
|
1151 for (j=0; j<wordBetweenOccValue; j++) {
|
|
1152 c = bwt[bwtIndex];
|
|
1153 sum += decodeTable[c >> 16];
|
|
1154 sum += decodeTable[c & 0x0000FFFF];
|
|
1155 bwtIndex++;
|
|
1156 }
|
|
1157 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
|
1158 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
|
|
1159 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
|
|
1160 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
|
|
1161 tempOccValue1[3] += sum;
|
|
1162 } else {
|
|
1163 if (sum == 0x00000100) {
|
|
1164 tempOccValue1[0] += 256;
|
|
1165 } else if (sum == 0x00010000) {
|
|
1166 tempOccValue1[1] += 256;
|
|
1167 } else if (sum == 0x01000000) {
|
|
1168 tempOccValue1[2] += 256;
|
|
1169 } else {
|
|
1170 tempOccValue1[3] += 256;
|
|
1171 }
|
|
1172 }
|
|
1173 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
|
|
1174 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
|
|
1175 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
|
|
1176 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
|
|
1177 tempOccValue0[0] = tempOccValue1[0];
|
|
1178 tempOccValue0[1] = tempOccValue1[1];
|
|
1179 tempOccValue0[2] = tempOccValue1[2];
|
|
1180 tempOccValue0[3] = tempOccValue1[3];
|
|
1181 sum = 0;
|
|
1182 occIndex++;
|
|
1183
|
|
1184 for (j=0; j<wordBetweenOccValue; j++) {
|
|
1185 c = bwt[bwtIndex];
|
|
1186 sum += decodeTable[c >> 16];
|
|
1187 sum += decodeTable[c & 0x0000FFFF];
|
|
1188 bwtIndex++;
|
|
1189 }
|
|
1190 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
|
1191 tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8;
|
|
1192 tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8;
|
|
1193 tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8;
|
|
1194 tempOccValue0[3] += sum;
|
|
1195 } else {
|
|
1196 if (sum == 0x00000100) {
|
|
1197 tempOccValue0[0] += 256;
|
|
1198 } else if (sum == 0x00010000) {
|
|
1199 tempOccValue0[1] += 256;
|
|
1200 } else if (sum == 0x01000000) {
|
|
1201 tempOccValue0[2] += 256;
|
|
1202 } else {
|
|
1203 tempOccValue0[3] += 256;
|
|
1204 }
|
|
1205 }
|
|
1206 }
|
|
1207
|
|
1208 sum = 0;
|
|
1209 tempOccValue1[0] = tempOccValue0[0];
|
|
1210 tempOccValue1[1] = tempOccValue0[1];
|
|
1211 tempOccValue1[2] = tempOccValue0[2];
|
|
1212 tempOccValue1[3] = tempOccValue0[3];
|
|
1213
|
|
1214 if (occIndex * 2 < numberOfOccValue - 1) {
|
|
1215 for (j=0; j<wordBetweenOccValue; j++) {
|
|
1216 c = bwt[bwtIndex];
|
|
1217 sum += decodeTable[c >> 16];
|
|
1218 sum += decodeTable[c & 0x0000FFFF];
|
|
1219 bwtIndex++;
|
|
1220 }
|
|
1221 if (!DNA_OCC_SUM_EXCEPTION(sum)) {
|
|
1222 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8;
|
|
1223 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8;
|
|
1224 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8;
|
|
1225 tempOccValue1[3] += sum;
|
|
1226 } else {
|
|
1227 if (sum == 0x00000100) {
|
|
1228 tempOccValue1[0] += 256;
|
|
1229 } else if (sum == 0x00010000) {
|
|
1230 tempOccValue1[1] += 256;
|
|
1231 } else if (sum == 0x01000000) {
|
|
1232 tempOccValue1[2] += 256;
|
|
1233 } else {
|
|
1234 tempOccValue1[3] += 256;
|
|
1235 }
|
|
1236 }
|
|
1237 }
|
|
1238
|
|
1239 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
|
|
1240 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
|
|
1241 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
|
|
1242 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
|
|
1243
|
|
1244 }
|
|
1245
|
|
1246 static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar)
|
|
1247 {
|
|
1248 unsigned int i;
|
|
1249 unsigned int mergedBwtSizeInWord, mergedOccSizeInWord;
|
|
1250 unsigned int firstCharInThisIteration;
|
|
1251
|
|
1252 unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt;
|
|
1253 unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0;
|
|
1254
|
|
1255 #ifdef DEBUG
|
|
1256 if (numChar > bwtInc->buildSize) {
|
|
1257 fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n");
|
|
1258 exit(1);
|
|
1259 }
|
|
1260 #endif
|
|
1261
|
|
1262 mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar);
|
|
1263 mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar);
|
|
1264
|
|
1265 initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
|
|
1266
|
|
1267 if (bwtInc->bwt->textLength == 0) { // Initial build
|
|
1268
|
|
1269 // Set address
|
|
1270 seq = bwtInc->workingMemory;
|
|
1271 relativeRank = seq + bwtInc->buildSize + 1;
|
|
1272 mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place
|
|
1273
|
|
1274 BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar);
|
|
1275
|
|
1276 firstCharInThisIteration = relativeRank[0];
|
|
1277 relativeRank[numChar] = 0;
|
|
1278
|
|
1279 // Sort suffix
|
|
1280 QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE);
|
|
1281 newInverseSa0 = relativeRank[0];
|
|
1282
|
|
1283 // Clear BWT area
|
|
1284 initializeVAL(insertBwt, mergedBwtSizeInWord, 0);
|
|
1285
|
|
1286 // Build BWT
|
|
1287 BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift);
|
|
1288
|
|
1289 // so that the cumulativeCount is not deducted
|
|
1290 bwtInc->firstCharInLastIteration = ALPHABET_SIZE;
|
|
1291
|
|
1292 } else { // Incremental build
|
|
1293 // Set address
|
|
1294 sortedRank = bwtInc->workingMemory;
|
|
1295 seq = sortedRank + bwtInc->buildSize + 1;
|
|
1296 insertBwt = seq;
|
|
1297 relativeRank = seq + bwtInc->buildSize + 1;
|
|
1298
|
|
1299 // Store the first character of this iteration
|
|
1300 firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR);
|
|
1301
|
|
1302 // Count occurrence of input text
|
|
1303 ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable);
|
|
1304 // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration
|
|
1305 bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++;
|
|
1306 bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1];
|
|
1307 bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2];
|
|
1308 bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3];
|
|
1309
|
|
1310 // Get rank of new suffix among processed suffix
|
|
1311 // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group
|
|
1312 oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText,
|
|
1313 numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration);
|
|
1314
|
|
1315 // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group)
|
|
1316 for (i=0; i<ALPHABET_SIZE; i++) {
|
|
1317 if (bwtInc->cumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank ||
|
|
1318 bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) {
|
|
1319 BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]);
|
|
1320 } else {
|
|
1321 if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) {
|
|
1322 BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]);
|
|
1323 }
|
|
1324 if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) {
|
|
1325 BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1);
|
|
1326 }
|
|
1327 }
|
|
1328 }
|
|
1329
|
|
1330 // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt
|
|
1331 // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups)
|
|
1332 BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild);
|
|
1333 #ifdef DEBUG
|
|
1334 if (relativeRank[numChar] != oldInverseSa0RelativeRank) {
|
|
1335 fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n");
|
|
1336 exit(1);
|
|
1337 }
|
|
1338 #endif
|
|
1339
|
|
1340 // Sort suffix
|
|
1341 QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE);
|
|
1342
|
|
1343 newInverseSa0RelativeRank = relativeRank[0];
|
|
1344 newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank;
|
|
1345
|
|
1346 sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt
|
|
1347
|
|
1348 // Build BWT
|
|
1349 BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild);
|
|
1350
|
|
1351 // Merge BWT
|
|
1352 mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord
|
|
1353 - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR;
|
|
1354 // minus numberOfIteration * occInterval to create a buffer for merging
|
|
1355 BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar);
|
|
1356
|
|
1357 }
|
|
1358
|
|
1359 // Build auxiliary structure and update info and pointers in BWT
|
|
1360 bwtInc->bwt->textLength += numChar;
|
|
1361 bwtInc->bwt->bwtCode = mergedBwt;
|
|
1362 bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord;
|
|
1363 bwtInc->bwt->occSizeInWord = mergedOccSizeInWord;
|
|
1364 if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) {
|
|
1365 fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n");
|
|
1366 exit(1);
|
|
1367 }
|
|
1368
|
|
1369 bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord;
|
|
1370
|
|
1371 BWTClearTrailingBwtCode(bwtInc->bwt);
|
|
1372 BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor,
|
|
1373 bwtInc->bwt->textLength, bwtInc->bwt->decodeTable);
|
|
1374
|
|
1375 bwtInc->bwt->inverseSa0 = newInverseSa0;
|
|
1376
|
|
1377 bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0);
|
|
1378 bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1);
|
|
1379 bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2);
|
|
1380 bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3);
|
|
1381
|
|
1382 bwtInc->firstCharInLastIteration = firstCharInThisIteration;
|
|
1383
|
|
1384 // Set build size and text address for the next build
|
|
1385 BWTIncSetBuildSizeAndTextAddr(bwtInc);
|
|
1386 bwtInc->numberOfIterationDone++;
|
|
1387
|
|
1388 }
|
|
1389
|
|
1390 BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit,
|
|
1391 const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize)
|
|
1392 {
|
|
1393
|
|
1394 FILE *packedFile;
|
|
1395 unsigned int packedFileLen;
|
|
1396 unsigned int totalTextLength;
|
|
1397 unsigned int textToLoad, textSizeInByte;
|
|
1398 unsigned int processedTextLength;
|
|
1399 unsigned char lastByteLength;
|
|
1400
|
|
1401 BWTInc *bwtInc;
|
|
1402
|
|
1403 packedFile = (FILE*)fopen(inputFileName, "rb");
|
|
1404
|
|
1405 if (packedFile == NULL) {
|
|
1406 fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n");
|
|
1407 exit(1);
|
|
1408 }
|
|
1409
|
|
1410 fseek(packedFile, -1, SEEK_END);
|
|
1411 packedFileLen = ftell(packedFile);
|
|
1412 if ((int)packedFileLen < 0) {
|
|
1413 fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n");
|
|
1414 exit(1);
|
|
1415 }
|
|
1416 fread(&lastByteLength, sizeof(unsigned char), 1, packedFile);
|
|
1417 totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength);
|
|
1418
|
|
1419 bwtInc = BWTIncCreate(totalTextLength, targetNBit, initialMaxBuildSize, incMaxBuildSize);
|
|
1420
|
|
1421 BWTIncSetBuildSizeAndTextAddr(bwtInc);
|
|
1422
|
|
1423 if (bwtInc->buildSize > totalTextLength) {
|
|
1424 textToLoad = totalTextLength;
|
|
1425 } else {
|
|
1426 textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD);
|
|
1427 }
|
|
1428 textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte
|
|
1429
|
|
1430 fseek(packedFile, -2, SEEK_CUR);
|
|
1431 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
|
|
1432 fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile);
|
|
1433 fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR);
|
|
1434
|
|
1435 ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
|
|
1436 BWTIncConstruct(bwtInc, textToLoad);
|
|
1437
|
|
1438 processedTextLength = textToLoad;
|
|
1439
|
|
1440 while (processedTextLength < totalTextLength) {
|
|
1441 textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
|
|
1442 if (textToLoad > totalTextLength - processedTextLength) {
|
|
1443 textToLoad = totalTextLength - processedTextLength;
|
|
1444 }
|
|
1445 textSizeInByte = textToLoad / CHAR_PER_BYTE;
|
|
1446 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
|
|
1447 fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile);
|
|
1448 fseek(packedFile, -((int)textSizeInByte), SEEK_CUR);
|
|
1449 ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
|
|
1450 BWTIncConstruct(bwtInc, textToLoad);
|
|
1451 processedTextLength += textToLoad;
|
|
1452 if (bwtInc->numberOfIterationDone % 10 == 0) {
|
|
1453 printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n",
|
|
1454 bwtInc->numberOfIterationDone, processedTextLength);
|
|
1455 }
|
|
1456 }
|
|
1457 return bwtInc;
|
|
1458 }
|
|
1459
|
|
1460 void BWTFree(BWT *bwt)
|
|
1461 {
|
|
1462 if (bwt == 0) return;
|
|
1463 free(bwt->cumulativeFreq);
|
|
1464 free(bwt->bwtCode);
|
|
1465 free(bwt->occValue);
|
|
1466 free(bwt->occValueMajor);
|
|
1467 free(bwt->saValue);
|
|
1468 free(bwt->inverseSa);
|
|
1469 free(bwt->decodeTable);
|
|
1470 free(bwt->saIndexRange);
|
|
1471 free(bwt->saValueOnBoundary);
|
|
1472 free(bwt);
|
|
1473 }
|
|
1474
|
|
1475 void BWTIncFree(BWTInc *bwtInc)
|
|
1476 {
|
|
1477 if (bwtInc == 0) return;
|
|
1478 free(bwtInc->bwt);
|
|
1479 free(bwtInc->workingMemory);
|
|
1480 free(bwtInc);
|
|
1481 }
|
|
1482
|
|
1483 static unsigned int BWTFileSizeInWord(const unsigned int numChar)
|
|
1484 {
|
|
1485 // The $ in BWT at the position of inverseSa0 is not encoded
|
|
1486 return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
|
|
1487 }
|
|
1488
|
|
1489 void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName)
|
|
1490 {
|
|
1491 FILE *bwtFile;
|
|
1492 /* FILE *occValueFile; */
|
|
1493 unsigned int bwtLength;
|
|
1494
|
|
1495 bwtFile = (FILE*)fopen(bwtFileName, "wb");
|
|
1496 if (bwtFile == NULL) {
|
|
1497 fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n");
|
|
1498 exit(1);
|
|
1499 }
|
|
1500
|
|
1501 fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile);
|
|
1502 fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile);
|
|
1503 bwtLength = BWTFileSizeInWord(bwt->textLength);
|
|
1504 fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile);
|
|
1505 fclose(bwtFile);
|
|
1506 /*
|
|
1507 occValueFile = (FILE*)fopen(occValueFileName, "wb");
|
|
1508 if (occValueFile == NULL) {
|
|
1509 fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open occ value file!\n");
|
|
1510 exit(1);
|
|
1511 }
|
|
1512
|
|
1513 fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, occValueFile);
|
|
1514 fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, occValueFile);
|
|
1515 fwrite(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile);
|
|
1516 fwrite(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile);
|
|
1517 fclose(occValueFile);
|
|
1518 */
|
|
1519 }
|
|
1520
|
|
1521 void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
|
|
1522 {
|
|
1523 BWTInc *bwtInc;
|
|
1524 bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000);
|
|
1525 printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
|
|
1526 BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
|
|
1527 BWTIncFree(bwtInc);
|
|
1528 }
|
|
1529
|
|
1530 int bwt_bwtgen_main(int argc, char *argv[])
|
|
1531 {
|
|
1532 if (argc < 3) {
|
|
1533 fprintf(stderr, "Usage: bwtgen <in.pac> <out.bwt>\n");
|
|
1534 return 1;
|
|
1535 }
|
|
1536 bwt_bwtgen(argv[1], argv[2]);
|
|
1537 return 0;
|
|
1538 }
|
|
1539
|
|
1540 #ifdef MAIN_BWT_GEN
|
|
1541
|
|
1542 int main(int argc, char *argv[])
|
|
1543 {
|
|
1544 return bwt_bwtgen_main(argc, argv);
|
|
1545 }
|
|
1546
|
|
1547 #endif
|