Mercurial > repos > ashvark > qiime_1_8_0
comparison bwa-0.6.2/bwt_gen.c @ 2:a294fbfcb1db draft default tip
Uploaded BWA
author | ashvark |
---|---|
date | Fri, 18 Jul 2014 07:55:59 -0400 |
parents | dd1186b11b3b |
children |
comparison
equal
deleted
inserted
replaced
1:a9636dc1e99a | 2:a294fbfcb1db |
---|---|
1 /* | |
2 | |
3 BWTConstruct.c BWT-Index Construction | |
4 | |
5 This module constructs BWT and auxiliary data structures. | |
6 | |
7 Copyright (C) 2004, Wong Chi Kwong. | |
8 | |
9 This program is free software; you can redistribute it and/or | |
10 modify it under the terms of the GNU General Public License | |
11 as published by the Free Software Foundation; either version 2 | |
12 of the License, or (at your option) any later version. | |
13 | |
14 This program is distributed in the hope that it will be useful, | |
15 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 GNU General Public License for more details. | |
18 | |
19 You should have received a copy of the GNU General Public License | |
20 along with this program; if not, write to the Free Software | |
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |
22 | |
23 */ | |
24 | |
25 #include <stdio.h> | |
26 #include <stdlib.h> | |
27 #include <string.h> | |
28 #include <assert.h> | |
29 #include <stdint.h> | |
30 #include "QSufSort.h" | |
31 | |
32 typedef uint64_t bgint_t; | |
33 typedef int64_t sbgint_t; | |
34 | |
35 #define ALPHABET_SIZE 4 | |
36 #define BIT_PER_CHAR 2 | |
37 #define CHAR_PER_WORD 16 | |
38 #define CHAR_PER_BYTE 4 | |
39 | |
40 #define BITS_IN_WORD 32 | |
41 #define BITS_IN_BYTE 8 | |
42 #define BYTES_IN_WORD 4 | |
43 | |
44 #define ALL_ONE_MASK 0xFFFFFFFF | |
45 #define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536 | |
46 | |
47 #define BITS_PER_OCC_VALUE 16 | |
48 #define OCC_VALUE_PER_WORD 2 | |
49 #define OCC_INTERVAL 256 | |
50 #define OCC_INTERVAL_MAJOR 65536 | |
51 | |
52 #define TRUE 1 | |
53 #define FALSE 0 | |
54 | |
55 #define BWTINC_INSERT_SORT_NUM_ITEM 7 | |
56 | |
57 #define MIN_AVAILABLE_WORD 0x10000 | |
58 | |
59 #define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 ) | |
60 #define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) | |
61 #define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) ) | |
62 #define med3(a, b, c) ( a<b ? (b<c ? b : a<c ? c : a) : (b>c ? b : a>c ? c : a)) | |
63 #define swap(a, b, t); t = a; a = b; b = t; | |
64 #define truncateLeft(value, offset) ( (value) << (offset) >> (offset) ) | |
65 #define truncateRight(value, offset) ( (value) >> (offset) << (offset) ) | |
66 #define DNA_OCC_SUM_EXCEPTION(sum) ((sum & 0xfefefeff) == 0) | |
67 | |
68 typedef struct BWT { | |
69 bgint_t textLength; // length of the text | |
70 bgint_t inverseSa0; // SA-1[0] | |
71 bgint_t *cumulativeFreq; // cumulative frequency | |
72 unsigned int *bwtCode; // BWT code | |
73 unsigned int *occValue; // Occurrence values stored explicitly | |
74 bgint_t *occValueMajor; // Occurrence values stored explicitly | |
75 unsigned int *decodeTable; // For decoding BWT by table lookup | |
76 bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated | |
77 bgint_t occSizeInWord; // Temporary variable to hold the memory allocated | |
78 bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated | |
79 } BWT; | |
80 | |
81 typedef struct BWTInc { | |
82 BWT *bwt; | |
83 unsigned int numberOfIterationDone; | |
84 bgint_t *cumulativeCountInCurrentBuild; | |
85 bgint_t availableWord; | |
86 bgint_t buildSize; | |
87 bgint_t initialMaxBuildSize; | |
88 bgint_t incMaxBuildSize; | |
89 unsigned int firstCharInLastIteration; | |
90 unsigned int *workingMemory; | |
91 unsigned int *packedText; | |
92 unsigned char *textBuffer; | |
93 unsigned int *packedShift; | |
94 } BWTInc; | |
95 | |
96 static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar, | |
97 unsigned int lastByteLength) | |
98 { | |
99 return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; | |
100 } | |
101 | |
102 static void initializeVAL(unsigned int *startAddr, const bgint_t length, const unsigned int initValue) | |
103 { | |
104 bgint_t i; | |
105 for (i=0; i<length; i++) startAddr[i] = initValue; | |
106 } | |
107 | |
108 static void initializeVAL_bg(bgint_t *startAddr, const bgint_t length, const bgint_t initValue) | |
109 { | |
110 bgint_t i; | |
111 for (i=0; i<length; i++) startAddr[i] = initValue; | |
112 } | |
113 | |
114 static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable) | |
115 { | |
116 unsigned int i, j, c, t; | |
117 | |
118 for (i=0; i<DNA_OCC_CNT_TABLE_SIZE_IN_WORD; i++) { | |
119 dnaDecodeTable[i] = 0; | |
120 c = i; | |
121 for (j=0; j<8; j++) { | |
122 t = c & 0x00000003; | |
123 dnaDecodeTable[i] += 1 << (t * 8); | |
124 c >>= 2; | |
125 } | |
126 } | |
127 | |
128 } | |
129 // for BWTIncCreate() | |
130 static bgint_t BWTOccValueMajorSizeInWord(const bgint_t numChar) | |
131 { | |
132 bgint_t numOfOccValue; | |
133 unsigned numOfOccIntervalPerMajor; | |
134 numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding | |
135 numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; | |
136 return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE; | |
137 } | |
138 // for BWTIncCreate() | |
139 static bgint_t BWTOccValueMinorSizeInWord(const bgint_t numChar) | |
140 { | |
141 bgint_t numOfOccValue; | |
142 numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding | |
143 return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE; | |
144 } | |
145 // for BWTIncCreate() | |
146 static bgint_t BWTResidentSizeInWord(const bgint_t numChar) { | |
147 | |
148 bgint_t numCharRoundUpToOccInterval; | |
149 | |
150 // The $ in BWT at the position of inverseSa0 is not encoded | |
151 numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL; | |
152 | |
153 return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD; | |
154 | |
155 } | |
156 | |
157 static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) | |
158 { | |
159 bgint_t maxBuildSize; | |
160 | |
161 if (bwtInc->bwt->textLength == 0) { | |
162 // initial build | |
163 // Minus 2 because n+1 entries of seq and rank needed for n char | |
164 maxBuildSize = (bwtInc->availableWord - (2 + OCC_INTERVAL / CHAR_PER_WORD) * (sizeof(bgint_t) / 4)) | |
165 / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD / (sizeof(bgint_t) / 4); | |
166 if (bwtInc->initialMaxBuildSize > 0) { | |
167 bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); | |
168 } else { | |
169 bwtInc->buildSize = maxBuildSize; | |
170 } | |
171 } else { | |
172 // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char | |
173 // Minus numberOfIterationDone because bwt slightly shift to left in each iteration | |
174 maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord | |
175 - (3 + bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) * (sizeof(bgint_t) / 4)) | |
176 / 3 / (sizeof(bgint_t) / 4); | |
177 if (maxBuildSize < CHAR_PER_WORD) { | |
178 fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); | |
179 exit(1); | |
180 } | |
181 if (bwtInc->incMaxBuildSize > 0) { | |
182 bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize); | |
183 } else { | |
184 bwtInc->buildSize = maxBuildSize; | |
185 } | |
186 if (bwtInc->buildSize < CHAR_PER_WORD) | |
187 bwtInc->buildSize = CHAR_PER_WORD; | |
188 } | |
189 | |
190 if (bwtInc->buildSize < CHAR_PER_WORD) { | |
191 fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); | |
192 exit(1); | |
193 } | |
194 | |
195 bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; | |
196 | |
197 bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4); | |
198 bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4)); | |
199 } | |
200 | |
201 // for ceilLog2() | |
202 unsigned int leadingZero(const unsigned int input) | |
203 { | |
204 unsigned int l; | |
205 const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, | |
206 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |
207 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
208 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
209 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
210 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
211 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
212 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; | |
213 | |
214 if (input & 0xFFFF0000) { | |
215 if (input & 0xFF000000) { | |
216 l = leadingZero8bit[input >> 24]; | |
217 } else { | |
218 l = 8 + leadingZero8bit[input >> 16]; | |
219 } | |
220 } else { | |
221 if (input & 0x0000FF00) { | |
222 l = 16 + leadingZero8bit[input >> 8]; | |
223 } else { | |
224 l = 24 + leadingZero8bit[input]; | |
225 } | |
226 } | |
227 return l; | |
228 | |
229 } | |
230 // for BitPerBytePackedChar() | |
231 static unsigned int ceilLog2(const unsigned int input) | |
232 { | |
233 if (input <= 1) return 0; | |
234 return BITS_IN_WORD - leadingZero(input - 1); | |
235 | |
236 } | |
237 // for ConvertBytePackedToWordPacked() | |
238 static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize) | |
239 { | |
240 unsigned int bitPerChar; | |
241 bitPerChar = ceilLog2(alphabetSize); | |
242 // Return the largest number of bit that does not affect packing efficiency | |
243 if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar) | |
244 bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar); | |
245 return bitPerChar; | |
246 } | |
247 // for ConvertBytePackedToWordPacked() | |
248 static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) | |
249 { | |
250 return ceilLog2(alphabetSize); | |
251 } | |
252 | |
253 static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, | |
254 const bgint_t textLength) | |
255 { | |
256 bgint_t i; | |
257 unsigned int j, k, c; | |
258 unsigned int bitPerBytePackedChar; | |
259 unsigned int bitPerWordPackedChar; | |
260 unsigned int charPerWord; | |
261 unsigned int charPerByte; | |
262 unsigned int bytePerIteration; | |
263 bgint_t byteProcessed = 0; | |
264 bgint_t wordProcessed = 0; | |
265 unsigned int mask, shift; | |
266 | |
267 unsigned int buffer[BITS_IN_WORD]; | |
268 | |
269 bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); | |
270 bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); | |
271 charPerByte = BITS_IN_BYTE / bitPerBytePackedChar; | |
272 charPerWord = BITS_IN_WORD / bitPerWordPackedChar; | |
273 | |
274 bytePerIteration = charPerWord / charPerByte; | |
275 mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar); | |
276 shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar; | |
277 | |
278 while ((wordProcessed + 1) * charPerWord < textLength) { | |
279 | |
280 k = 0; | |
281 for (i=0; i<bytePerIteration; i++) { | |
282 c = (unsigned int)input[byteProcessed] << shift; | |
283 for (j=0; j<charPerByte; j++) { | |
284 buffer[k] = c & mask; | |
285 c <<= bitPerBytePackedChar; | |
286 k++; | |
287 } | |
288 byteProcessed++; | |
289 } | |
290 | |
291 c = 0; | |
292 for (i=0; i<charPerWord; i++) { | |
293 c |= buffer[i] >> bitPerWordPackedChar * i; | |
294 } | |
295 output[wordProcessed] = c; | |
296 wordProcessed++; | |
297 | |
298 } | |
299 | |
300 k = 0; | |
301 for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) { | |
302 c = (unsigned int)input[byteProcessed] << shift; | |
303 for (j=0; j<charPerByte; j++) { | |
304 buffer[k] = c & mask; | |
305 c <<= bitPerBytePackedChar; | |
306 k++; | |
307 } | |
308 byteProcessed++; | |
309 } | |
310 | |
311 c = 0; | |
312 for (i=0; i<textLength - wordProcessed * charPerWord; i++) { | |
313 c |= buffer[i] >> bitPerWordPackedChar * i; | |
314 } | |
315 output[wordProcessed] = c; | |
316 } | |
317 | |
318 BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable) | |
319 { | |
320 BWT *bwt; | |
321 | |
322 bwt = (BWT*)calloc(1, sizeof(BWT)); | |
323 | |
324 bwt->textLength = 0; | |
325 | |
326 bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); | |
327 initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); | |
328 | |
329 bwt->bwtSizeInWord = 0; | |
330 | |
331 // Generate decode tables | |
332 if (decodeTable == NULL) { | |
333 bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); | |
334 GenerateDNAOccCountTable(bwt->decodeTable); | |
335 } else { | |
336 bwt->decodeTable = decodeTable; | |
337 } | |
338 | |
339 bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); | |
340 bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t)); | |
341 | |
342 bwt->occSizeInWord = 0; | |
343 bwt->occValue = NULL; | |
344 | |
345 return bwt; | |
346 } | |
347 | |
348 BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, unsigned int incMaxBuildSize) | |
349 { | |
350 BWTInc *bwtInc; | |
351 unsigned int i, n_iter; | |
352 | |
353 if (textLength < incMaxBuildSize) incMaxBuildSize = textLength; | |
354 if (textLength < initialMaxBuildSize) initialMaxBuildSize = textLength; | |
355 | |
356 bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); | |
357 bwtInc->numberOfIterationDone = 0; | |
358 bwtInc->bwt = BWTCreate(textLength, NULL); | |
359 bwtInc->initialMaxBuildSize = initialMaxBuildSize; | |
360 bwtInc->incMaxBuildSize = incMaxBuildSize; | |
361 bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); | |
362 initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); | |
363 | |
364 // Build frequently accessed data | |
365 bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); | |
366 for (i=0; i<CHAR_PER_WORD; i++) | |
367 bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; | |
368 | |
369 n_iter = (textLength - initialMaxBuildSize) / incMaxBuildSize + 1; | |
370 bwtInc->availableWord = BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength) // minimal memory requirement | |
371 + OCC_INTERVAL / BIT_PER_CHAR * n_iter * 2 * (sizeof(bgint_t) / 4) // buffer at the end of occ array | |
372 + incMaxBuildSize/5 * 3 * (sizeof(bgint_t) / 4); // space for the 3 temporary arrays in each iteration | |
373 if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; // lh3: otherwise segfaul when availableWord is too small | |
374 fprintf(stderr, "[%s] textLength=%ld, availableWord=%ld\n", __func__, (long)textLength, (long)bwtInc->availableWord); | |
375 bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); | |
376 | |
377 return bwtInc; | |
378 } | |
379 // for BWTIncConstruct() | |
380 static void BWTIncPutPackedTextToRank(const unsigned int *packedText, bgint_t* __restrict rank, | |
381 bgint_t* __restrict cumulativeCount, const bgint_t numChar) | |
382 { | |
383 bgint_t i; | |
384 unsigned int j; | |
385 unsigned int c, t; | |
386 unsigned int packedMask; | |
387 bgint_t rankIndex; | |
388 bgint_t lastWord; | |
389 unsigned int numCharInLastWord; | |
390 | |
391 lastWord = (numChar - 1) / CHAR_PER_WORD; | |
392 numCharInLastWord = numChar - lastWord * CHAR_PER_WORD; | |
393 | |
394 packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR); | |
395 rankIndex = numChar - 1; | |
396 | |
397 t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR); | |
398 for (i=0; i<numCharInLastWord; i++) { | |
399 c = t & packedMask; | |
400 cumulativeCount[c+1]++; | |
401 rank[rankIndex] = c; | |
402 rankIndex--; | |
403 t >>= BIT_PER_CHAR; | |
404 } | |
405 | |
406 for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 | |
407 t = packedText[i]; | |
408 for (j=0; j<CHAR_PER_WORD; j++) { | |
409 c = t & packedMask; | |
410 cumulativeCount[c+1]++; | |
411 rank[rankIndex] = c; | |
412 rankIndex--; | |
413 t >>= BIT_PER_CHAR; | |
414 } | |
415 } | |
416 | |
417 // Convert occurrence to cumulativeCount | |
418 cumulativeCount[2] += cumulativeCount[1]; | |
419 cumulativeCount[3] += cumulativeCount[2]; | |
420 cumulativeCount[4] += cumulativeCount[3]; | |
421 } | |
422 | |
423 | |
424 static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const bgint_t index, | |
425 bgint_t* __restrict occCount, const unsigned int* dnaDecodeTable) | |
426 { | |
427 static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, | |
428 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, | |
429 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, | |
430 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; | |
431 | |
432 bgint_t iteration, i; | |
433 unsigned int wordToCount, charToCount; | |
434 unsigned int j, c, sum; | |
435 | |
436 occCount[0] = 0; | |
437 occCount[1] = 0; | |
438 occCount[2] = 0; | |
439 occCount[3] = 0; | |
440 | |
441 iteration = index / 256; | |
442 wordToCount = (index - iteration * 256) / 16; | |
443 charToCount = index - iteration * 256 - wordToCount * 16; | |
444 | |
445 for (i=0; i<iteration; i++) { | |
446 | |
447 sum = 0; | |
448 for (j=0; j<16; j++) { | |
449 sum += dnaDecodeTable[*dna >> 16]; | |
450 sum += dnaDecodeTable[*dna & 0x0000FFFF]; | |
451 dna++; | |
452 } | |
453 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
454 occCount[0] += sum & 0x000000FF; sum >>= 8; | |
455 occCount[1] += sum & 0x000000FF; sum >>= 8; | |
456 occCount[2] += sum & 0x000000FF; sum >>= 8; | |
457 occCount[3] += sum; | |
458 } else { | |
459 // only some or all of the 3 bits are on | |
460 // in reality, only one of the four cases are possible | |
461 if (sum == 0x00000100) { | |
462 occCount[0] += 256; | |
463 } else if (sum == 0x00010000) { | |
464 occCount[1] += 256; | |
465 } else if (sum == 0x01000000) { | |
466 occCount[2] += 256; | |
467 } else if (sum == 0x00000000) { | |
468 occCount[3] += 256; | |
469 } else { | |
470 fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n"); | |
471 exit(1); | |
472 } | |
473 } | |
474 | |
475 } | |
476 | |
477 sum = 0; | |
478 for (j=0; j<wordToCount; j++) { | |
479 sum += dnaDecodeTable[*dna >> 16]; | |
480 sum += dnaDecodeTable[*dna & 0x0000FFFF]; | |
481 dna++; | |
482 } | |
483 | |
484 if (charToCount > 0) { | |
485 c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; | |
486 sum += dnaDecodeTable[c >> 16]; | |
487 sum += dnaDecodeTable[c & 0xFFFF]; | |
488 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess | |
489 } | |
490 | |
491 occCount[0] += sum & 0x000000FF; sum >>= 8; | |
492 occCount[1] += sum & 0x000000FF; sum >>= 8; | |
493 occCount[2] += sum & 0x000000FF; sum >>= 8; | |
494 occCount[3] += sum; | |
495 } | |
496 | |
497 static void BWTIncBuildPackedBwt(const bgint_t *relativeRank, unsigned int* __restrict bwt, const bgint_t numChar, | |
498 const bgint_t *cumulativeCount, const unsigned int *packedShift) { | |
499 | |
500 bgint_t i, r; | |
501 unsigned int c; | |
502 bgint_t previousRank, currentRank; | |
503 bgint_t wordIndex, charIndex; | |
504 bgint_t inverseSa0; | |
505 | |
506 inverseSa0 = previousRank = relativeRank[0]; | |
507 | |
508 for (i=1; i<=numChar; i++) { | |
509 currentRank = relativeRank[i]; | |
510 // previousRank > cumulativeCount[c] because $ is one of the char | |
511 c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) | |
512 + (previousRank > cumulativeCount[3]); | |
513 // set bwt for currentRank | |
514 if (c > 0) { | |
515 // c <> 'a' | |
516 r = currentRank; | |
517 if (r > inverseSa0) { | |
518 // - 1 because $ at inverseSa0 is not encoded | |
519 r--; | |
520 } | |
521 wordIndex = r / CHAR_PER_WORD; | |
522 charIndex = r - wordIndex * CHAR_PER_WORD; | |
523 bwt[wordIndex] |= c << packedShift[charIndex]; | |
524 } | |
525 previousRank = currentRank; | |
526 } | |
527 } | |
528 | |
529 static inline bgint_t BWTOccValueExplicit(const BWT *bwt, const bgint_t occIndexExplicit, | |
530 const unsigned int character) | |
531 { | |
532 bgint_t occIndexMajor; | |
533 | |
534 occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; | |
535 | |
536 if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) { | |
537 return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + | |
538 (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16); | |
539 | |
540 } else { | |
541 return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + | |
542 (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF); | |
543 } | |
544 } | |
545 | |
546 | |
547 static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, | |
548 const unsigned int* dnaDecodeTable) | |
549 { | |
550 static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, | |
551 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, | |
552 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, | |
553 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; | |
554 | |
555 unsigned int wordToCount, charToCount; | |
556 unsigned int i, c; | |
557 unsigned int sum = 0; | |
558 | |
559 wordToCount = index / 16; | |
560 charToCount = index - wordToCount * 16; | |
561 | |
562 for (i=0; i<wordToCount; i++) { | |
563 sum += dnaDecodeTable[dna[i] >> 16]; | |
564 sum += dnaDecodeTable[dna[i] & 0x0000FFFF]; | |
565 } | |
566 | |
567 if (charToCount > 0) { | |
568 c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; | |
569 sum += dnaDecodeTable[c >> 16]; | |
570 sum += dnaDecodeTable[c & 0xFFFF]; | |
571 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess | |
572 } | |
573 | |
574 return (sum >> (character * 8)) & 0x000000FF; | |
575 | |
576 } | |
577 | |
578 static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, | |
579 const unsigned int* dnaDecodeTable) | |
580 { | |
581 static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F, | |
582 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, | |
583 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, | |
584 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; | |
585 | |
586 unsigned int wordToCount, charToCount; | |
587 unsigned int i, c; | |
588 unsigned int sum = 0; | |
589 | |
590 wordToCount = index / 16; | |
591 charToCount = index - wordToCount * 16; | |
592 | |
593 dna -= wordToCount + 1; | |
594 | |
595 if (charToCount > 0) { | |
596 c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; | |
597 sum += dnaDecodeTable[c >> 16]; | |
598 sum += dnaDecodeTable[c & 0xFFFF]; | |
599 sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess | |
600 } | |
601 | |
602 for (i=0; i<wordToCount; i++) { | |
603 dna++; | |
604 sum += dnaDecodeTable[*dna >> 16]; | |
605 sum += dnaDecodeTable[*dna & 0x0000FFFF]; | |
606 } | |
607 | |
608 return (sum >> (character * 8)) & 0x000000FF; | |
609 | |
610 } | |
611 | |
612 bgint_t BWTOccValue(const BWT *bwt, bgint_t index, const unsigned int character) | |
613 { | |
614 bgint_t occValue; | |
615 bgint_t occExplicitIndex, occIndex; | |
616 | |
617 // $ is supposed to be positioned at inverseSa0 but it is not encoded | |
618 // therefore index is subtracted by 1 for adjustment | |
619 if (index > bwt->inverseSa0) | |
620 index--; | |
621 | |
622 occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding | |
623 occIndex = occExplicitIndex * OCC_INTERVAL; | |
624 occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character); | |
625 | |
626 if (occIndex == index) | |
627 return occValue; | |
628 | |
629 if (occIndex < index) { | |
630 return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable); | |
631 } else { | |
632 return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable); | |
633 } | |
634 } | |
635 | |
636 static bgint_t BWTIncGetAbsoluteRank(BWT *bwt, bgint_t* __restrict absoluteRank, bgint_t* __restrict seq, | |
637 const unsigned int *packedText, const bgint_t numChar, | |
638 const bgint_t* cumulativeCount, const unsigned int firstCharInLastIteration) | |
639 { | |
640 bgint_t saIndex; | |
641 bgint_t lastWord; | |
642 unsigned int packedMask; | |
643 bgint_t i; | |
644 unsigned int c, t, j; | |
645 bgint_t rankIndex; | |
646 unsigned int shift; | |
647 bgint_t seqIndexFromStart[ALPHABET_SIZE]; | |
648 bgint_t seqIndexFromEnd[ALPHABET_SIZE]; | |
649 | |
650 for (i=0; i<ALPHABET_SIZE; i++) { | |
651 seqIndexFromStart[i] = cumulativeCount[i]; | |
652 seqIndexFromEnd[i] = cumulativeCount[i+1] - 1; | |
653 } | |
654 | |
655 shift = BITS_IN_WORD - BIT_PER_CHAR; | |
656 packedMask = ALL_ONE_MASK >> shift; | |
657 saIndex = bwt->inverseSa0; | |
658 rankIndex = numChar - 1; | |
659 | |
660 lastWord = numChar / CHAR_PER_WORD; | |
661 for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 | |
662 t = packedText[i]; | |
663 for (j=0; j<CHAR_PER_WORD; j++) { | |
664 c = t & packedMask; | |
665 saIndex = bwt->cumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1; | |
666 // A counting sort using the first character of suffix is done here | |
667 // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0 | |
668 if (saIndex > bwt->inverseSa0) { | |
669 seq[seqIndexFromEnd[c]] = rankIndex; | |
670 absoluteRank[seqIndexFromEnd[c]] = saIndex; | |
671 seqIndexFromEnd[c]--; | |
672 } else { | |
673 seq[seqIndexFromStart[c]] = rankIndex; | |
674 absoluteRank[seqIndexFromStart[c]] = saIndex; | |
675 seqIndexFromStart[c]++; | |
676 } | |
677 rankIndex--; | |
678 t >>= BIT_PER_CHAR; | |
679 } | |
680 } | |
681 | |
682 absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters | |
683 seq[seqIndexFromStart[firstCharInLastIteration]] = numChar; | |
684 | |
685 return seqIndexFromStart[firstCharInLastIteration]; | |
686 } | |
687 | |
688 static void BWTIncSortKey(bgint_t* __restrict key, bgint_t* __restrict seq, const bgint_t numItem) | |
689 { | |
690 #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD | |
691 | |
692 int64_t lowIndex, highIndex, midIndex; | |
693 int64_t lowPartitionIndex, highPartitionIndex; | |
694 int64_t lowStack[32], highStack[32]; | |
695 int stackDepth; | |
696 int64_t i, j; | |
697 bgint_t tempSeq, tempKey; | |
698 int64_t numberOfEqualKey; | |
699 | |
700 if (numItem < 2) return; | |
701 | |
702 stackDepth = 0; | |
703 | |
704 lowIndex = 0; | |
705 highIndex = numItem - 1; | |
706 | |
707 for (;;) { | |
708 | |
709 for (;;) { | |
710 | |
711 // Sort small array of data | |
712 if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays | |
713 for (i=lowIndex+1; i<=highIndex; i++) { | |
714 tempSeq = seq[i]; | |
715 tempKey = key[i]; | |
716 for (j = i; j > lowIndex && key[j-1] > tempKey; j--) { | |
717 seq[j] = seq[j-1]; | |
718 key[j] = key[j-1]; | |
719 } | |
720 if (j != i) { | |
721 seq[j] = tempSeq; | |
722 key[j] = tempKey; | |
723 } | |
724 } | |
725 break; | |
726 } | |
727 | |
728 // Choose pivot as median of the lowest, middle, and highest data; sort the three data | |
729 | |
730 midIndex = average(lowIndex, highIndex); | |
731 if (key[lowIndex] > key[midIndex]) { | |
732 tempSeq = seq[lowIndex]; | |
733 tempKey = key[lowIndex]; | |
734 seq[lowIndex] = seq[midIndex]; | |
735 key[lowIndex] = key[midIndex]; | |
736 seq[midIndex] = tempSeq; | |
737 key[midIndex] = tempKey; | |
738 } | |
739 if (key[lowIndex] > key[highIndex]) { | |
740 tempSeq = seq[lowIndex]; | |
741 tempKey = key[lowIndex]; | |
742 seq[lowIndex] = seq[highIndex]; | |
743 key[lowIndex] = key[highIndex]; | |
744 seq[highIndex] = tempSeq; | |
745 key[highIndex] = tempKey; | |
746 } | |
747 if (key[midIndex] > key[highIndex]) { | |
748 tempSeq = seq[midIndex]; | |
749 tempKey = key[midIndex]; | |
750 seq[midIndex] = seq[highIndex]; | |
751 key[midIndex] = key[highIndex]; | |
752 seq[highIndex] = tempSeq; | |
753 key[highIndex] = tempKey; | |
754 } | |
755 | |
756 // Partition data | |
757 | |
758 numberOfEqualKey = 0; | |
759 | |
760 lowPartitionIndex = lowIndex + 1; | |
761 highPartitionIndex = highIndex - 1; | |
762 | |
763 for (;;) { | |
764 while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) { | |
765 numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]); | |
766 lowPartitionIndex++; | |
767 } | |
768 while (lowPartitionIndex < highPartitionIndex) { | |
769 if (key[midIndex] >= key[highPartitionIndex]) { | |
770 numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]); | |
771 break; | |
772 } | |
773 highPartitionIndex--; | |
774 } | |
775 if (lowPartitionIndex >= highPartitionIndex) { | |
776 break; | |
777 } | |
778 tempSeq = seq[lowPartitionIndex]; | |
779 tempKey = key[lowPartitionIndex]; | |
780 seq[lowPartitionIndex] = seq[highPartitionIndex]; | |
781 key[lowPartitionIndex] = key[highPartitionIndex]; | |
782 seq[highPartitionIndex] = tempSeq; | |
783 key[highPartitionIndex] = tempKey; | |
784 if (highPartitionIndex == midIndex) { | |
785 // partition key has been moved | |
786 midIndex = lowPartitionIndex; | |
787 } | |
788 lowPartitionIndex++; | |
789 highPartitionIndex--; | |
790 } | |
791 | |
792 // Adjust the partition index | |
793 highPartitionIndex = lowPartitionIndex; | |
794 lowPartitionIndex--; | |
795 | |
796 // move the partition key to end of low partition | |
797 tempSeq = seq[midIndex]; | |
798 tempKey = key[midIndex]; | |
799 seq[midIndex] = seq[lowPartitionIndex]; | |
800 key[midIndex] = key[lowPartitionIndex]; | |
801 seq[lowPartitionIndex] = tempSeq; | |
802 key[lowPartitionIndex] = tempKey; | |
803 | |
804 if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) { | |
805 | |
806 // Many keys = partition key; separate the equal key data from the lower partition | |
807 | |
808 midIndex = lowIndex; | |
809 | |
810 for (;;) { | |
811 while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) { | |
812 midIndex++; | |
813 } | |
814 while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) { | |
815 lowPartitionIndex--; | |
816 } | |
817 if (midIndex >= lowPartitionIndex) { | |
818 break; | |
819 } | |
820 tempSeq = seq[midIndex]; | |
821 tempKey = key[midIndex]; | |
822 seq[midIndex] = seq[lowPartitionIndex - 1]; | |
823 key[midIndex] = key[lowPartitionIndex - 1]; | |
824 seq[lowPartitionIndex - 1] = tempSeq; | |
825 key[lowPartitionIndex - 1] = tempKey; | |
826 midIndex++; | |
827 lowPartitionIndex--; | |
828 } | |
829 | |
830 } | |
831 | |
832 if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) { | |
833 // put the larger partition to stack | |
834 lowStack[stackDepth] = lowIndex; | |
835 highStack[stackDepth] = lowPartitionIndex - 1; | |
836 stackDepth++; | |
837 // sort the smaller partition first | |
838 lowIndex = highPartitionIndex; | |
839 } else { | |
840 // put the larger partition to stack | |
841 lowStack[stackDepth] = highPartitionIndex; | |
842 highStack[stackDepth] = highIndex; | |
843 stackDepth++; | |
844 // sort the smaller partition first | |
845 if (lowPartitionIndex > lowIndex) { | |
846 highIndex = lowPartitionIndex - 1; | |
847 } else { | |
848 // all keys in the partition equals to the partition key | |
849 break; | |
850 } | |
851 } | |
852 continue; | |
853 } | |
854 | |
855 // Pop a range from stack | |
856 if (stackDepth > 0) { | |
857 stackDepth--; | |
858 lowIndex = lowStack[stackDepth]; | |
859 highIndex = highStack[stackDepth]; | |
860 continue; | |
861 } else return; | |
862 } | |
863 } | |
864 | |
865 | |
866 static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __restrict seq, | |
867 bgint_t* __restrict relativeRank, const bgint_t numItem, | |
868 bgint_t oldInverseSa0, const bgint_t *cumulativeCount) | |
869 { | |
870 bgint_t i, c; | |
871 bgint_t s, r; | |
872 bgint_t lastRank, lastIndex; | |
873 bgint_t oldInverseSa0RelativeRank = 0; | |
874 bgint_t freq; | |
875 | |
876 lastIndex = numItem; | |
877 lastRank = sortedRank[numItem]; | |
878 if (lastRank > oldInverseSa0) { | |
879 sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt | |
880 } | |
881 s = seq[numItem]; | |
882 relativeRank[s] = numItem; | |
883 if (lastRank == oldInverseSa0) { | |
884 oldInverseSa0RelativeRank = numItem; | |
885 oldInverseSa0++; // so that this segment of code is not run again | |
886 lastRank++; // so that oldInverseSa0 become a sorted group with 1 item | |
887 } | |
888 | |
889 c = ALPHABET_SIZE - 1; | |
890 freq = cumulativeCount[c]; | |
891 | |
892 for (i=numItem; i--;) { // from numItem - 1 to 0 | |
893 r = sortedRank[i]; | |
894 if (r > oldInverseSa0) | |
895 sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt | |
896 s = seq[i]; | |
897 if (i < freq) { | |
898 if (lastIndex >= freq) | |
899 lastRank++; // to trigger the group across alphabet boundary to be split | |
900 c--; | |
901 freq = cumulativeCount[c]; | |
902 } | |
903 if (r == lastRank) { | |
904 relativeRank[s] = lastIndex; | |
905 } else { | |
906 if (i == lastIndex - 1) { | |
907 if (lastIndex < numItem && (sbgint_t)seq[lastIndex + 1] < 0) { | |
908 seq[lastIndex] = seq[lastIndex + 1] - 1; | |
909 } else { | |
910 seq[lastIndex] = (bgint_t)-1; | |
911 } | |
912 } | |
913 lastIndex = i; | |
914 lastRank = r; | |
915 relativeRank[s] = i; | |
916 if (r == oldInverseSa0) { | |
917 oldInverseSa0RelativeRank = i; | |
918 oldInverseSa0++; // so that this segment of code is not run again | |
919 lastRank++; // so that oldInverseSa0 become a sorted group with 1 item | |
920 } | |
921 } | |
922 } | |
923 | |
924 } | |
925 | |
926 static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank, const bgint_t numChar, | |
927 const bgint_t *cumulativeCount) | |
928 { | |
929 unsigned int c; | |
930 bgint_t i; | |
931 bgint_t previousRank, currentRank; | |
932 | |
933 previousRank = relativeRank[0]; | |
934 | |
935 for (i=1; i<=numChar; i++) { | |
936 currentRank = relativeRank[i]; | |
937 c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2]) | |
938 + (previousRank >= cumulativeCount[3]); | |
939 insertBwt[currentRank] = c; | |
940 previousRank = currentRank; | |
941 } | |
942 } | |
943 | |
944 static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, | |
945 unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt) | |
946 { | |
947 unsigned int bitsInWordMinusBitPerChar; | |
948 bgint_t leftShift, rightShift; | |
949 bgint_t o; | |
950 bgint_t oIndex, iIndex, mIndex; | |
951 bgint_t mWord, mChar, oWord, oChar; | |
952 bgint_t numInsert; | |
953 | |
954 bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; | |
955 | |
956 oIndex = 0; | |
957 iIndex = 0; | |
958 mIndex = 0; | |
959 | |
960 mWord = 0; | |
961 mChar = 0; | |
962 | |
963 mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration | |
964 | |
965 while (oIndex < numOldBwt) { | |
966 | |
967 // copy from insertBwt | |
968 while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) { | |
969 if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0 | |
970 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); | |
971 mIndex++; | |
972 mChar++; | |
973 if (mChar == CHAR_PER_WORD) { | |
974 mChar = 0; | |
975 mWord++; | |
976 mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary | |
977 } | |
978 } | |
979 iIndex++; | |
980 } | |
981 | |
982 // Copy from oldBwt to mergedBwt | |
983 if (iIndex <= numInsertBwt) { | |
984 o = sortedRank[iIndex]; | |
985 } else { | |
986 o = numOldBwt; | |
987 } | |
988 numInsert = o - oIndex; | |
989 | |
990 oWord = oIndex / CHAR_PER_WORD; | |
991 oChar = oIndex - oWord * CHAR_PER_WORD; | |
992 if (oChar > mChar) { | |
993 leftShift = (oChar - mChar) * BIT_PER_CHAR; | |
994 rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR; | |
995 mergedBwt[mWord] = mergedBwt[mWord] | |
996 | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)) | |
997 | (oldBwt[oWord+1] >> rightShift); | |
998 oIndex += min(numInsert, CHAR_PER_WORD - mChar); | |
999 while (o > oIndex) { | |
1000 oWord++; | |
1001 mWord++; | |
1002 mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift); | |
1003 oIndex += CHAR_PER_WORD; | |
1004 } | |
1005 } else if (oChar < mChar) { | |
1006 rightShift = (mChar - oChar) * BIT_PER_CHAR; | |
1007 leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR; | |
1008 mergedBwt[mWord] = mergedBwt[mWord] | |
1009 | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)); | |
1010 oIndex += min(numInsert, CHAR_PER_WORD - mChar); | |
1011 while (o > oIndex) { | |
1012 oWord++; | |
1013 mWord++; | |
1014 mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift); | |
1015 oIndex += CHAR_PER_WORD; | |
1016 } | |
1017 } else { // oChar == mChar | |
1018 mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR); | |
1019 oIndex += min(numInsert, CHAR_PER_WORD - mChar); | |
1020 while (o > oIndex) { | |
1021 oWord++; | |
1022 mWord++; | |
1023 mergedBwt[mWord] = oldBwt[oWord]; | |
1024 oIndex += CHAR_PER_WORD; | |
1025 } | |
1026 } | |
1027 oIndex = o; | |
1028 mIndex += numInsert; | |
1029 | |
1030 // Clear the trailing garbage in mergedBwt | |
1031 mWord = mIndex / CHAR_PER_WORD; | |
1032 mChar = mIndex - mWord * CHAR_PER_WORD; | |
1033 if (mChar == 0) { | |
1034 mergedBwt[mWord] = 0; | |
1035 } else { | |
1036 mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR)); | |
1037 } | |
1038 | |
1039 } | |
1040 | |
1041 // copy from insertBwt | |
1042 while (iIndex <= numInsertBwt) { | |
1043 if (sortedRank[iIndex] != 0) { | |
1044 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); | |
1045 mIndex++; | |
1046 mChar++; | |
1047 if (mChar == CHAR_PER_WORD) { | |
1048 mChar = 0; | |
1049 mWord++; | |
1050 mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary | |
1051 } | |
1052 } | |
1053 iIndex++; | |
1054 } | |
1055 } | |
1056 | |
1057 void BWTClearTrailingBwtCode(BWT *bwt) | |
1058 { | |
1059 bgint_t bwtResidentSizeInWord; | |
1060 bgint_t wordIndex, offset; | |
1061 bgint_t i; | |
1062 | |
1063 bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); | |
1064 | |
1065 wordIndex = bwt->textLength / CHAR_PER_WORD; | |
1066 offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR; | |
1067 if (offset > 0) { | |
1068 bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset); | |
1069 } else { | |
1070 if (wordIndex < bwtResidentSizeInWord) { | |
1071 bwt->bwtCode[wordIndex] = 0; | |
1072 } | |
1073 } | |
1074 | |
1075 for (i=wordIndex+1; i<bwtResidentSizeInWord; i++) { | |
1076 bwt->bwtCode[i] = 0; | |
1077 } | |
1078 } | |
1079 | |
1080 | |
1081 void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue, | |
1082 bgint_t* __restrict occValueMajor, | |
1083 const bgint_t textLength, const unsigned int* decodeTable) | |
1084 { | |
1085 bgint_t numberOfOccValueMajor, numberOfOccValue; | |
1086 unsigned int wordBetweenOccValue; | |
1087 bgint_t numberOfOccIntervalPerMajor; | |
1088 unsigned int c; | |
1089 bgint_t i, j; | |
1090 bgint_t occMajorIndex; | |
1091 bgint_t occIndex, bwtIndex; | |
1092 bgint_t sum; // perhaps unsigned is big enough | |
1093 bgint_t tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; | |
1094 | |
1095 wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; | |
1096 | |
1097 // Calculate occValue | |
1098 numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding | |
1099 numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; | |
1100 numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor; | |
1101 | |
1102 tempOccValue0[0] = 0; | |
1103 tempOccValue0[1] = 0; | |
1104 tempOccValue0[2] = 0; | |
1105 tempOccValue0[3] = 0; | |
1106 occValueMajor[0] = 0; | |
1107 occValueMajor[1] = 0; | |
1108 occValueMajor[2] = 0; | |
1109 occValueMajor[3] = 0; | |
1110 | |
1111 occIndex = 0; | |
1112 bwtIndex = 0; | |
1113 for (occMajorIndex=1; occMajorIndex<numberOfOccValueMajor; occMajorIndex++) { | |
1114 | |
1115 for (i=0; i<numberOfOccIntervalPerMajor/2; i++) { | |
1116 | |
1117 sum = 0; | |
1118 tempOccValue1[0] = tempOccValue0[0]; | |
1119 tempOccValue1[1] = tempOccValue0[1]; | |
1120 tempOccValue1[2] = tempOccValue0[2]; | |
1121 tempOccValue1[3] = tempOccValue0[3]; | |
1122 | |
1123 for (j=0; j<wordBetweenOccValue; j++) { | |
1124 c = bwt[bwtIndex]; | |
1125 sum += decodeTable[c >> 16]; | |
1126 sum += decodeTable[c & 0x0000FFFF]; | |
1127 bwtIndex++; | |
1128 } | |
1129 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
1130 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; | |
1131 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; | |
1132 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; | |
1133 tempOccValue1[3] += sum; | |
1134 } else { | |
1135 if (sum == 0x00000100) { | |
1136 tempOccValue1[0] += 256; | |
1137 } else if (sum == 0x00010000) { | |
1138 tempOccValue1[1] += 256; | |
1139 } else if (sum == 0x01000000) { | |
1140 tempOccValue1[2] += 256; | |
1141 } else { | |
1142 tempOccValue1[3] += 256; | |
1143 } | |
1144 } | |
1145 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; | |
1146 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; | |
1147 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; | |
1148 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; | |
1149 tempOccValue0[0] = tempOccValue1[0]; | |
1150 tempOccValue0[1] = tempOccValue1[1]; | |
1151 tempOccValue0[2] = tempOccValue1[2]; | |
1152 tempOccValue0[3] = tempOccValue1[3]; | |
1153 sum = 0; | |
1154 | |
1155 occIndex++; | |
1156 | |
1157 for (j=0; j<wordBetweenOccValue; j++) { | |
1158 c = bwt[bwtIndex]; | |
1159 sum += decodeTable[c >> 16]; | |
1160 sum += decodeTable[c & 0x0000FFFF]; | |
1161 bwtIndex++; | |
1162 } | |
1163 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
1164 tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; | |
1165 tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; | |
1166 tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; | |
1167 tempOccValue0[3] += sum; | |
1168 } else { | |
1169 if (sum == 0x00000100) { | |
1170 tempOccValue0[0] += 256; | |
1171 } else if (sum == 0x00010000) { | |
1172 tempOccValue0[1] += 256; | |
1173 } else if (sum == 0x01000000) { | |
1174 tempOccValue0[2] += 256; | |
1175 } else { | |
1176 tempOccValue0[3] += 256; | |
1177 } | |
1178 } | |
1179 } | |
1180 | |
1181 occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0]; | |
1182 occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1]; | |
1183 occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2]; | |
1184 occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3]; | |
1185 tempOccValue0[0] = 0; | |
1186 tempOccValue0[1] = 0; | |
1187 tempOccValue0[2] = 0; | |
1188 tempOccValue0[3] = 0; | |
1189 | |
1190 } | |
1191 | |
1192 while (occIndex < (numberOfOccValue-1)/2) { | |
1193 sum = 0; | |
1194 tempOccValue1[0] = tempOccValue0[0]; | |
1195 tempOccValue1[1] = tempOccValue0[1]; | |
1196 tempOccValue1[2] = tempOccValue0[2]; | |
1197 tempOccValue1[3] = tempOccValue0[3]; | |
1198 for (j=0; j<wordBetweenOccValue; j++) { | |
1199 c = bwt[bwtIndex]; | |
1200 sum += decodeTable[c >> 16]; | |
1201 sum += decodeTable[c & 0x0000FFFF]; | |
1202 bwtIndex++; | |
1203 } | |
1204 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
1205 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; | |
1206 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; | |
1207 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; | |
1208 tempOccValue1[3] += sum; | |
1209 } else { | |
1210 if (sum == 0x00000100) { | |
1211 tempOccValue1[0] += 256; | |
1212 } else if (sum == 0x00010000) { | |
1213 tempOccValue1[1] += 256; | |
1214 } else if (sum == 0x01000000) { | |
1215 tempOccValue1[2] += 256; | |
1216 } else { | |
1217 tempOccValue1[3] += 256; | |
1218 } | |
1219 } | |
1220 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; | |
1221 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; | |
1222 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; | |
1223 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; | |
1224 tempOccValue0[0] = tempOccValue1[0]; | |
1225 tempOccValue0[1] = tempOccValue1[1]; | |
1226 tempOccValue0[2] = tempOccValue1[2]; | |
1227 tempOccValue0[3] = tempOccValue1[3]; | |
1228 sum = 0; | |
1229 occIndex++; | |
1230 | |
1231 for (j=0; j<wordBetweenOccValue; j++) { | |
1232 c = bwt[bwtIndex]; | |
1233 sum += decodeTable[c >> 16]; | |
1234 sum += decodeTable[c & 0x0000FFFF]; | |
1235 bwtIndex++; | |
1236 } | |
1237 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
1238 tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; | |
1239 tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; | |
1240 tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; | |
1241 tempOccValue0[3] += sum; | |
1242 } else { | |
1243 if (sum == 0x00000100) { | |
1244 tempOccValue0[0] += 256; | |
1245 } else if (sum == 0x00010000) { | |
1246 tempOccValue0[1] += 256; | |
1247 } else if (sum == 0x01000000) { | |
1248 tempOccValue0[2] += 256; | |
1249 } else { | |
1250 tempOccValue0[3] += 256; | |
1251 } | |
1252 } | |
1253 } | |
1254 | |
1255 sum = 0; | |
1256 tempOccValue1[0] = tempOccValue0[0]; | |
1257 tempOccValue1[1] = tempOccValue0[1]; | |
1258 tempOccValue1[2] = tempOccValue0[2]; | |
1259 tempOccValue1[3] = tempOccValue0[3]; | |
1260 | |
1261 if (occIndex * 2 < numberOfOccValue - 1) { | |
1262 for (j=0; j<wordBetweenOccValue; j++) { | |
1263 c = bwt[bwtIndex]; | |
1264 sum += decodeTable[c >> 16]; | |
1265 sum += decodeTable[c & 0x0000FFFF]; | |
1266 bwtIndex++; | |
1267 } | |
1268 if (!DNA_OCC_SUM_EXCEPTION(sum)) { | |
1269 tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; | |
1270 tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; | |
1271 tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; | |
1272 tempOccValue1[3] += sum; | |
1273 } else { | |
1274 if (sum == 0x00000100) { | |
1275 tempOccValue1[0] += 256; | |
1276 } else if (sum == 0x00010000) { | |
1277 tempOccValue1[1] += 256; | |
1278 } else if (sum == 0x01000000) { | |
1279 tempOccValue1[2] += 256; | |
1280 } else { | |
1281 tempOccValue1[3] += 256; | |
1282 } | |
1283 } | |
1284 } | |
1285 | |
1286 occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; | |
1287 occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; | |
1288 occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; | |
1289 occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; | |
1290 | |
1291 } | |
1292 | |
1293 static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) | |
1294 { | |
1295 unsigned int i; | |
1296 bgint_t mergedBwtSizeInWord, mergedOccSizeInWord; | |
1297 unsigned int firstCharInThisIteration; | |
1298 | |
1299 bgint_t *relativeRank, *seq, *sortedRank; | |
1300 unsigned int *insertBwt, *mergedBwt; | |
1301 bgint_t newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; | |
1302 | |
1303 mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar); | |
1304 mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar); | |
1305 | |
1306 initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); | |
1307 | |
1308 if (bwtInc->bwt->textLength == 0) { // Initial build | |
1309 | |
1310 // Set address | |
1311 seq = (bgint_t*)bwtInc->workingMemory; | |
1312 relativeRank = seq + bwtInc->buildSize + 1; | |
1313 // mergedBwt and packedTex may share memory | |
1314 mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place | |
1315 | |
1316 assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)bwtInc->packedText); | |
1317 assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)mergedBwt); | |
1318 | |
1319 // ->packedText is not used any more and may be overwritten by mergedBwt | |
1320 BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar); | |
1321 | |
1322 firstCharInThisIteration = relativeRank[0]; | |
1323 relativeRank[numChar] = 0; | |
1324 | |
1325 // Sort suffix | |
1326 QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)ALPHABET_SIZE - 1, 0, FALSE); | |
1327 newInverseSa0 = relativeRank[0]; | |
1328 | |
1329 // Clear BWT area | |
1330 initializeVAL(insertBwt, mergedBwtSizeInWord, 0); | |
1331 | |
1332 // Build BWT | |
1333 BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift); | |
1334 | |
1335 // so that the cumulativeCount is not deducted | |
1336 bwtInc->firstCharInLastIteration = ALPHABET_SIZE; | |
1337 | |
1338 } else { // Incremental build | |
1339 // Set address | |
1340 sortedRank = (bgint_t*)bwtInc->workingMemory; | |
1341 seq = sortedRank + bwtInc->buildSize + 1; | |
1342 insertBwt = (unsigned*)seq; // insertBwt and seq share memory | |
1343 // relativeRank and ->packedText may share memory | |
1344 relativeRank = seq + bwtInc->buildSize + 1; | |
1345 | |
1346 assert((void*)relativeRank <= (void*)bwtInc->packedText); | |
1347 | |
1348 // Store the first character of this iteration | |
1349 firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR); | |
1350 | |
1351 // Count occurrence of input text | |
1352 ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable); | |
1353 // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration | |
1354 bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++; | |
1355 bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1]; | |
1356 bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2]; | |
1357 bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3]; | |
1358 | |
1359 // Get rank of new suffix among processed suffix | |
1360 // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group | |
1361 // ->packedText is not used any more and will be overwritten by relativeRank | |
1362 oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, | |
1363 numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration); | |
1364 | |
1365 // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group) | |
1366 for (i=0; i<ALPHABET_SIZE; i++) { | |
1367 if (bwtInc->cumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank || | |
1368 bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) { | |
1369 BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]); | |
1370 } else { | |
1371 if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) { | |
1372 BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]); | |
1373 } | |
1374 if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) { | |
1375 BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1); | |
1376 } | |
1377 } | |
1378 } | |
1379 | |
1380 // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt | |
1381 // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups) | |
1382 BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild); | |
1383 assert(relativeRank[numChar] == oldInverseSa0RelativeRank); | |
1384 | |
1385 // Sort suffix | |
1386 QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)numChar, 1, TRUE); | |
1387 | |
1388 newInverseSa0RelativeRank = relativeRank[0]; | |
1389 newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank; | |
1390 | |
1391 sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt | |
1392 | |
1393 // Build BWT; seq is overwritten by insertBwt | |
1394 BWTIncBuildBwt(insertBwt, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); | |
1395 | |
1396 // Merge BWT; relativeRank may be overwritten by mergedBwt | |
1397 mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord | |
1398 - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR * (sizeof(bgint_t) / 4); // minus numberOfIteration * occInterval to create a buffer for merging | |
1399 assert(mergedBwt >= insertBwt + numChar); | |
1400 BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar); | |
1401 } | |
1402 | |
1403 // Build auxiliary structure and update info and pointers in BWT | |
1404 bwtInc->bwt->textLength += numChar; | |
1405 bwtInc->bwt->bwtCode = mergedBwt; | |
1406 bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord; | |
1407 bwtInc->bwt->occSizeInWord = mergedOccSizeInWord; | |
1408 assert(mergedBwt >= bwtInc->workingMemory + mergedOccSizeInWord); | |
1409 | |
1410 bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord; | |
1411 | |
1412 BWTClearTrailingBwtCode(bwtInc->bwt); | |
1413 BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor, | |
1414 bwtInc->bwt->textLength, bwtInc->bwt->decodeTable); | |
1415 | |
1416 bwtInc->bwt->inverseSa0 = newInverseSa0; | |
1417 | |
1418 bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0); | |
1419 bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1); | |
1420 bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2); | |
1421 bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3); | |
1422 | |
1423 bwtInc->firstCharInLastIteration = firstCharInThisIteration; | |
1424 | |
1425 // Set build size and text address for the next build | |
1426 BWTIncSetBuildSizeAndTextAddr(bwtInc); | |
1427 bwtInc->numberOfIterationDone++; | |
1428 | |
1429 } | |
1430 | |
1431 BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize) | |
1432 { | |
1433 | |
1434 FILE *packedFile; | |
1435 bgint_t packedFileLen; | |
1436 bgint_t totalTextLength; | |
1437 bgint_t textToLoad, textSizeInByte; | |
1438 bgint_t processedTextLength; | |
1439 unsigned char lastByteLength; | |
1440 | |
1441 BWTInc *bwtInc; | |
1442 | |
1443 packedFile = (FILE*)fopen(inputFileName, "rb"); | |
1444 | |
1445 if (packedFile == NULL) { | |
1446 fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n"); | |
1447 exit(1); | |
1448 } | |
1449 | |
1450 fseek(packedFile, -1, SEEK_END); | |
1451 packedFileLen = ftell(packedFile); | |
1452 fread(&lastByteLength, sizeof(unsigned char), 1, packedFile); | |
1453 totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); | |
1454 | |
1455 bwtInc = BWTIncCreate(totalTextLength, initialMaxBuildSize, incMaxBuildSize); | |
1456 | |
1457 BWTIncSetBuildSizeAndTextAddr(bwtInc); | |
1458 | |
1459 if (bwtInc->buildSize > totalTextLength) { | |
1460 textToLoad = totalTextLength; | |
1461 } else { | |
1462 textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD); | |
1463 } | |
1464 textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte | |
1465 | |
1466 fseek(packedFile, -2, SEEK_CUR); | |
1467 fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); | |
1468 fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); | |
1469 fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR); | |
1470 | |
1471 ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); | |
1472 BWTIncConstruct(bwtInc, textToLoad); | |
1473 | |
1474 processedTextLength = textToLoad; | |
1475 | |
1476 while (processedTextLength < totalTextLength) { | |
1477 textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; | |
1478 if (textToLoad > totalTextLength - processedTextLength) { | |
1479 textToLoad = totalTextLength - processedTextLength; | |
1480 } | |
1481 textSizeInByte = textToLoad / CHAR_PER_BYTE; | |
1482 fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); | |
1483 fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); | |
1484 fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); | |
1485 ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); | |
1486 BWTIncConstruct(bwtInc, textToLoad); | |
1487 processedTextLength += textToLoad; | |
1488 if (bwtInc->numberOfIterationDone % 10 == 0) { | |
1489 fprintf(stderr, "[BWTIncConstructFromPacked] %lu iterations done. %lu characters processed.\n", | |
1490 (long)bwtInc->numberOfIterationDone, (long)processedTextLength); | |
1491 } | |
1492 } | |
1493 return bwtInc; | |
1494 } | |
1495 | |
1496 void BWTFree(BWT *bwt) | |
1497 { | |
1498 if (bwt == 0) return; | |
1499 free(bwt->cumulativeFreq); | |
1500 free(bwt->bwtCode); | |
1501 free(bwt->occValue); | |
1502 free(bwt->occValueMajor); | |
1503 free(bwt->decodeTable); | |
1504 free(bwt); | |
1505 } | |
1506 | |
1507 void BWTIncFree(BWTInc *bwtInc) | |
1508 { | |
1509 if (bwtInc == 0) return; | |
1510 free(bwtInc->bwt); | |
1511 free(bwtInc->workingMemory); | |
1512 free(bwtInc); | |
1513 } | |
1514 | |
1515 static bgint_t BWTFileSizeInWord(const bgint_t numChar) | |
1516 { | |
1517 // The $ in BWT at the position of inverseSa0 is not encoded | |
1518 return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; | |
1519 } | |
1520 | |
1521 void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName) | |
1522 { | |
1523 FILE *bwtFile; | |
1524 /* FILE *occValueFile; */ | |
1525 bgint_t bwtLength; | |
1526 | |
1527 bwtFile = (FILE*)fopen(bwtFileName, "wb"); | |
1528 if (bwtFile == NULL) { | |
1529 fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n"); | |
1530 exit(1); | |
1531 } | |
1532 | |
1533 fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile); | |
1534 fwrite(bwt->cumulativeFreq + 1, sizeof(bgint_t), ALPHABET_SIZE, bwtFile); | |
1535 bwtLength = BWTFileSizeInWord(bwt->textLength); | |
1536 fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); | |
1537 fclose(bwtFile); | |
1538 } | |
1539 | |
1540 void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) | |
1541 { | |
1542 BWTInc *bwtInc; | |
1543 bwtInc = BWTIncConstructFromPacked(fn_pac, 10000000, 10000000); | |
1544 printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); | |
1545 BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); | |
1546 BWTIncFree(bwtInc); | |
1547 } | |
1548 | |
1549 int bwt_bwtgen_main(int argc, char *argv[]) | |
1550 { | |
1551 if (argc < 3) { | |
1552 fprintf(stderr, "Usage: bwtgen <in.pac> <out.bwt>\n"); | |
1553 return 1; | |
1554 } | |
1555 bwt_bwtgen(argv[1], argv[2]); | |
1556 return 0; | |
1557 } | |
1558 | |
1559 #ifdef MAIN_BWT_GEN | |
1560 | |
1561 int main(int argc, char *argv[]) | |
1562 { | |
1563 return bwt_bwtgen_main(argc, argv); | |
1564 } | |
1565 | |
1566 #endif |