Mercurial > repos > dawe > srf2fastq
comparison srf2fastq/io_lib-1.12.2/io_lib/hash_table.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d901c9f41a6a |
---|---|
1 #ifdef HAVE_CONFIG_H | |
2 #include "io_lib_config.h" | |
3 #endif | |
4 | |
5 #include <stdio.h> | |
6 #include <string.h> | |
7 #include <stdlib.h> | |
8 #include "io_lib/os.h" | |
9 #include "io_lib/hash_table.h" | |
10 #include "io_lib/jenkins_lookup3.h" | |
11 | |
12 /* ========================================================================= | |
13 * TCL's hash function. Basically hash*9 + char. | |
14 * ========================================================================= | |
15 */ | |
16 | |
17 uint32_t HashTcl(uint8_t *data, int len) { | |
18 uint32_t hash = 0; | |
19 int i; | |
20 | |
21 for (i = 0; i < len; i++) { | |
22 hash += (hash<<3) + data[i]; | |
23 } | |
24 | |
25 return hash; | |
26 } | |
27 | |
28 /* ========================================================================= | |
29 * Paul Hsieh's hash function | |
30 * http://www.azillionmonkeys.com/qed/hash.html | |
31 * ========================================================================= | |
32 */ | |
33 | |
34 #undef get16bits | |
35 #if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ | |
36 || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) | |
37 #define get16bits(d) (*((const uint16_t *) (d))) | |
38 #endif | |
39 | |
40 #if !defined (get16bits) | |
41 #define get16bits(d) ((((const uint8_t *)(d))[1] << 8UL)\ | |
42 +((const uint8_t *)(d))[0]) | |
43 #endif | |
44 | |
45 uint32_t HashHsieh(uint8_t *data, int len) { | |
46 uint32_t hash = 0, tmp; | |
47 int rem; | |
48 | |
49 if (len <= 0 || data == NULL) return 0; | |
50 | |
51 rem = len & 3; | |
52 len >>= 2; | |
53 | |
54 /* Main loop */ | |
55 for (;len > 0; len--) { | |
56 hash += get16bits (data); | |
57 tmp = (get16bits (data+2) << 11) ^ hash; | |
58 hash = (hash << 16) ^ tmp; | |
59 data += 2*sizeof (uint16_t); | |
60 hash += hash >> 11; | |
61 } | |
62 | |
63 /* Handle end cases */ | |
64 switch (rem) { | |
65 case 3: hash += get16bits (data); | |
66 hash ^= hash << 16; | |
67 hash ^= data[sizeof (uint16_t)] << 18; | |
68 hash += hash >> 11; | |
69 break; | |
70 case 2: hash += get16bits (data); | |
71 hash ^= hash << 11; | |
72 hash += hash >> 17; | |
73 break; | |
74 case 1: hash += *data; | |
75 hash ^= hash << 10; | |
76 hash += hash >> 1; | |
77 } | |
78 | |
79 /* Force "avalanching" of final 127 bits */ | |
80 hash ^= hash << 3; | |
81 hash += hash >> 5; | |
82 hash ^= hash << 2; | |
83 hash += hash >> 15; | |
84 hash ^= hash << 10; | |
85 | |
86 return hash; | |
87 } | |
88 | |
89 /* ========================================================================= | |
90 * Bob Jenkins' hash function | |
91 * http://burtleburtle.net/bob/hash/doobs.html | |
92 * | |
93 * See jenkins_lookup3.c for a new version of this that has good hash | |
94 * characteristics for a full 64-bit hash value. | |
95 * ========================================================================= | |
96 */ | |
97 | |
98 #define hashsize(n) ((uint32_t)1<<(n)) | |
99 #define hashmask(n) (hashsize(n)-1) | |
100 | |
101 /* | |
102 -------------------------------------------------------------------- | |
103 mix -- mix 3 32-bit values reversibly. | |
104 For every delta with one or two bits set, and the deltas of all three | |
105 high bits or all three low bits, whether the original value of a,b,c | |
106 is almost all zero or is uniformly distributed, | |
107 * If mix() is run forward or backward, at least 32 bits in a,b,c | |
108 have at least 1/4 probability of changing. | |
109 * If mix() is run forward, every bit of c will change between 1/3 and | |
110 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.) | |
111 mix() was built out of 36 single-cycle latency instructions in a | |
112 structure that could supported 2x parallelism, like so: | |
113 a -= b; | |
114 a -= c; x = (c>>13); | |
115 b -= c; a ^= x; | |
116 b -= a; x = (a<<8); | |
117 c -= a; b ^= x; | |
118 c -= b; x = (b>>13); | |
119 ... | |
120 Unfortunately, superscalar Pentiums and Sparcs can't take advantage | |
121 of that parallelism. They've also turned some of those single-cycle | |
122 latency instructions into multi-cycle latency instructions. Still, | |
123 this is the fastest good hash I could find. There were about 2^^68 | |
124 to choose from. I only looked at a billion or so. | |
125 -------------------------------------------------------------------- | |
126 */ | |
127 #define mix(a,b,c) \ | |
128 { \ | |
129 a -= b; a -= c; a ^= (c>>13); \ | |
130 b -= c; b -= a; b ^= (a<<8); \ | |
131 c -= a; c -= b; c ^= (b>>13); \ | |
132 a -= b; a -= c; a ^= (c>>12); \ | |
133 b -= c; b -= a; b ^= (a<<16); \ | |
134 c -= a; c -= b; c ^= (b>>5); \ | |
135 a -= b; a -= c; a ^= (c>>3); \ | |
136 b -= c; b -= a; b ^= (a<<10); \ | |
137 c -= a; c -= b; c ^= (b>>15); \ | |
138 } | |
139 | |
140 /* | |
141 -------------------------------------------------------------------- | |
142 hash() -- hash a variable-length key into a 32-bit value | |
143 k : the key (the unaligned variable-length array of bytes) | |
144 len : the length of the key, counting by bytes | |
145 initval : can be any 4-byte value | |
146 Returns a 32-bit value. Every bit of the key affects every bit of | |
147 the return value. Every 1-bit and 2-bit delta achieves avalanche. | |
148 About 6*len+35 instructions. | |
149 | |
150 The best hash table sizes are powers of 2. There is no need to do | |
151 mod a prime (mod is sooo slow!). If you need less than 32 bits, | |
152 use a bitmask. For example, if you need only 10 bits, do | |
153 h = (h & hashmask(10)); | |
154 In which case, the hash table should have hashsize(10) elements. | |
155 | |
156 If you are hashing n strings (uint8_t **)k, do it like this: | |
157 for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h); | |
158 | |
159 By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this | |
160 code any way you wish, private, educational, or commercial. It's free. | |
161 | |
162 See http://burtleburtle.net/bob/hash/evahash.html | |
163 Use for hash table lookup, or anything where one collision in 2^^32 is | |
164 acceptable. Do NOT use for cryptographic purposes. | |
165 -------------------------------------------------------------------- | |
166 */ | |
167 | |
168 uint32_t HashJenkins(uint8_t *k, int length /*, uint32_t initval */) | |
169 { | |
170 register uint32_t a,b,c,len; | |
171 | |
172 /* Set up the internal state */ | |
173 len = length; | |
174 a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */ | |
175 c = 0; /* initval; */ /* the previous hash value */ | |
176 | |
177 /*---------------------------------------- handle most of the key */ | |
178 while (len >= 12) | |
179 { | |
180 a += (k[0] +((uint32_t)k[1]<<8) +((uint32_t)k[2]<<16) +((uint32_t)k[3]<<24)); | |
181 b += (k[4] +((uint32_t)k[5]<<8) +((uint32_t)k[6]<<16) +((uint32_t)k[7]<<24)); | |
182 c += (k[8] +((uint32_t)k[9]<<8) +((uint32_t)k[10]<<16)+((uint32_t)k[11]<<24)); | |
183 mix(a,b,c); | |
184 k += 12; len -= 12; | |
185 } | |
186 | |
187 /*------------------------------------- handle the last 11 bytes */ | |
188 c += length; | |
189 switch(len) /* all the case statements fall through */ | |
190 { | |
191 case 11: c+=((uint32_t)k[10]<<24); | |
192 case 10: c+=((uint32_t)k[9]<<16); | |
193 case 9 : c+=((uint32_t)k[8]<<8); | |
194 /* the first byte of c is reserved for the length */ | |
195 case 8 : b+=((uint32_t)k[7]<<24); | |
196 case 7 : b+=((uint32_t)k[6]<<16); | |
197 case 6 : b+=((uint32_t)k[5]<<8); | |
198 case 5 : b+=k[4]; | |
199 case 4 : a+=((uint32_t)k[3]<<24); | |
200 case 3 : a+=((uint32_t)k[2]<<16); | |
201 case 2 : a+=((uint32_t)k[1]<<8); | |
202 case 1 : a+=k[0]; | |
203 /* case 0: nothing left to add */ | |
204 } | |
205 mix(a,b,c); | |
206 /*-------------------------------------------- report the result */ | |
207 return c; | |
208 } | |
209 | |
210 /* | |
211 * An interface to the above hash functions. | |
212 * Returns: | |
213 * A 32-bit hash key, suitable for masking down to smaller bit sizes | |
214 */ | |
215 uint32_t hash(int func, uint8_t *key, int key_len) { | |
216 switch (func) { | |
217 case HASH_FUNC_HSIEH: | |
218 return HashHsieh(key, key_len); | |
219 | |
220 case HASH_FUNC_TCL: | |
221 return HashTcl(key, key_len); | |
222 | |
223 case HASH_FUNC_JENKINS: | |
224 return HashJenkins(key, key_len); | |
225 | |
226 case HASH_FUNC_JENKINS3: | |
227 { | |
228 uint32_t pc = 0, pb = 0; | |
229 HashJenkins3(key, key_len, &pc, &pb); | |
230 return pc; | |
231 } | |
232 } | |
233 | |
234 return 0; | |
235 } | |
236 | |
237 /* | |
238 * As per hash() above but returns a 64-bit key. For 32-bit hash functions | |
239 * this is simply a duplication of the 32-bit value. | |
240 */ | |
241 uint64_t hash64(int func, uint8_t *key, int key_len) { | |
242 uint32_t pc = 0, pb = 0; | |
243 | |
244 switch (func) { | |
245 case HASH_FUNC_HSIEH: | |
246 pb = pc = HashHsieh(key, key_len); | |
247 break; | |
248 | |
249 case HASH_FUNC_TCL: | |
250 pb = pc = HashTcl(key, key_len); | |
251 break; | |
252 | |
253 case HASH_FUNC_JENKINS: | |
254 pb = pc = HashJenkins(key, key_len); | |
255 break; | |
256 | |
257 case HASH_FUNC_JENKINS3: | |
258 HashJenkins3(key, key_len, &pc, &pb); | |
259 break; | |
260 } | |
261 | |
262 return pc + (((uint64_t)pb)<<32); | |
263 } | |
264 | |
265 /* ========================================================================= | |
266 * Hash Table handling code | |
267 * ========================================================================= | |
268 */ | |
269 | |
270 /* Multiplicative factors indicating when to grow or shrink the hash table */ | |
271 #define HASH_TABLE_RESIZE 3 | |
272 | |
273 /* | |
274 * Creates a HashItem for use with HashTable h. | |
275 * | |
276 * Returns: | |
277 * A pointer to new HashItem on success | |
278 * NULL on failure. | |
279 */ | |
280 static HashItem *HashItemCreate(HashTable *h) { | |
281 HashItem *hi; | |
282 | |
283 hi = (h->options & HASH_POOL_ITEMS | |
284 ? pool_alloc(h->hi_pool) : malloc(sizeof(*hi))); | |
285 if (NULL == hi) return NULL; | |
286 | |
287 hi->data.p = NULL; | |
288 hi->data.i = 0; | |
289 hi->next = NULL; | |
290 hi->key = NULL; | |
291 hi->key_len = 0; | |
292 | |
293 h->nused++; | |
294 | |
295 return hi; | |
296 } | |
297 | |
298 /* | |
299 * Deallocates a HashItem created via HashItemCreate. | |
300 * | |
301 * This function will not remove the item from the HashTable so be sure to | |
302 * call HashTableDel() first if appropriate. | |
303 */ | |
304 static void HashItemDestroy(HashTable *h, HashItem *hi, int deallocate_data) { | |
305 if (!hi) return; | |
306 | |
307 if (!(h->options & HASH_NONVOLATILE_KEYS) || (h->options & HASH_OWN_KEYS)) | |
308 if (hi->key) | |
309 free(hi->key); | |
310 | |
311 if (deallocate_data && hi->data.p) | |
312 free(hi->data.p); | |
313 | |
314 if (h->options & HASH_POOL_ITEMS) { | |
315 pool_free(h->hi_pool, hi); | |
316 } else { | |
317 free(hi); | |
318 } | |
319 | |
320 h->nused--; | |
321 } | |
322 | |
323 /* | |
324 * Creates a new HashTable object. Size will be rounded up to the next | |
325 * power of 2. It is a starting point and hash tables may be grown or shrunk | |
326 * as needed (if HASH_DYNAMIC_SIZE is used). | |
327 * | |
328 * If HASH_POOL_ITEMS is used, HashItems will be allocated in blocks to reduce | |
329 * malloc overhead in the case where a large number of items is required. | |
330 * HashItems allocated this way will be put on a free list when destroyed; the | |
331 * memory will only be reclaimed when the entire hash table is destroyed. | |
332 * | |
333 * Options are as defined in the header file (see HASH_* macros). | |
334 * | |
335 * Returns: | |
336 * A pointer to a HashTable on success | |
337 * NULL on failure | |
338 */ | |
339 HashTable *HashTableCreate(int size, int options) { | |
340 HashTable *h; | |
341 int i, bits; | |
342 uint32_t mask; | |
343 | |
344 if (!(h = (HashTable *)malloc(sizeof(*h)))) | |
345 return NULL; | |
346 | |
347 if (options & HASH_POOL_ITEMS) { | |
348 h->hi_pool = pool_create(sizeof(HashItem)); | |
349 if (NULL == h->hi_pool) { | |
350 free(h); | |
351 return NULL; | |
352 } | |
353 } else { | |
354 h->hi_pool = NULL; | |
355 } | |
356 | |
357 if (size < 4) | |
358 size = 4; /* an inconsequential minimum size */ | |
359 | |
360 /* Round the requested size to the next power of 2 */ | |
361 bits = 0; | |
362 size--; | |
363 while (size) { | |
364 size /= 2; | |
365 bits++; | |
366 } | |
367 size = 1<<bits; | |
368 mask = size-1; | |
369 | |
370 h->nbuckets = size; | |
371 h->mask = mask; | |
372 h->options = options; | |
373 h->nused = 0; | |
374 h->bucket = (HashItem **)malloc(sizeof(*h->bucket) * size); | |
375 if (NULL == h->bucket) { | |
376 HashTableDestroy(h, 0); | |
377 return NULL; | |
378 } | |
379 | |
380 for (i = 0; i < size; i++) { | |
381 h->bucket[i] = NULL; | |
382 } | |
383 | |
384 return h; | |
385 } | |
386 | |
387 /* | |
388 * Deallocates a HashTable object (created by HashTableCreate). | |
389 * | |
390 * The deallocate_data parameter is a boolean to indicate whether the | |
391 * data attached to the hash table should also be free()d. DO NOT USE | |
392 * this if the HashData attached was not a pointer allocated using | |
393 * malloc(). | |
394 */ | |
395 void HashTableDestroy(HashTable *h, int deallocate_data) { | |
396 int i; | |
397 | |
398 if (!h) | |
399 return; | |
400 | |
401 if (h->bucket) { | |
402 for (i = 0; i < h->nbuckets; i++) { | |
403 HashItem *hi = h->bucket[i], *next = NULL; | |
404 for (hi = h->bucket[i]; hi; hi = next) { | |
405 next = hi->next; | |
406 HashItemDestroy(h, hi, deallocate_data); | |
407 } | |
408 } | |
409 | |
410 free(h->bucket); | |
411 } | |
412 | |
413 if (h->hi_pool) pool_destroy(h->hi_pool); | |
414 | |
415 free(h); | |
416 } | |
417 | |
418 /* | |
419 * Resizes a HashTable to have 'newsize' buckets. | |
420 * This is called automatically when adding or removing items so that the | |
421 * hash table keeps at a sensible scale. | |
422 * | |
423 * FIXME: Halving the size of the hash table is simply a matter of coaelescing | |
424 * every other bucket. Instead we currently rehash (which is slower). | |
425 * Doubling the size of the hash table currently requires rehashing, but this | |
426 * too could be optimised by storing the full 32-bit hash of the key along | |
427 * with the key itself. This then means that it's just a matter of seeing what | |
428 * the next significant bit is. It's a memory vs speed tradeoff though and | |
429 * re-hashing is pretty quick. | |
430 * | |
431 * Returns 0 for success | |
432 * -1 for failure | |
433 */ | |
434 int HashTableResize(HashTable *h, int newsize) { | |
435 HashTable *h2; | |
436 int i; | |
437 | |
438 /* fprintf(stderr, "Resizing to %d\n", newsize); */ | |
439 | |
440 /* Create a new hash table and rehash everything into it */ | |
441 h2 = HashTableCreate(newsize, h->options); | |
442 | |
443 for (i = 0; i < h->nbuckets; i++) { | |
444 HashItem *hi, *next; | |
445 for (hi = h->bucket[i]; hi; hi = next) { | |
446 uint64_t hv = hash64(h2->options & HASH_FUNC_MASK, | |
447 (uint8_t *)hi->key, hi->key_len) & h2->mask; | |
448 next = hi->next; | |
449 hi->next = h2->bucket[hv]; | |
450 h2->bucket[hv] = hi; | |
451 } | |
452 } | |
453 | |
454 /* Swap the links over & free */ | |
455 free(h->bucket); | |
456 h->bucket = h2->bucket; | |
457 h->nbuckets = h2->nbuckets; | |
458 h->mask = h2->mask; | |
459 free(h2); | |
460 | |
461 return 0; | |
462 } | |
463 | |
464 /* | |
465 * Adds a HashData item to HashTable h with a specific key. Key can be binary | |
466 * data, but if key_len is passed as zero then strlen() will be used to | |
467 * determine the key length. | |
468 * | |
469 * The "new" pointer may be passed as NULL. When not NULL it is filled out | |
470 * as a boolean to indicate whether the key is already in this hash table. | |
471 * | |
472 * The HASH_ALLOW_DUP_KEYS option (specified when using HashTableCreate) | |
473 * will allow duplicate keys to be stored, and hence *new is also zero. | |
474 * By default duplicate keys are disallowed. | |
475 * | |
476 * Keys are considered to be volatile memory (ie temporary storage) and so the | |
477 * hash table takes separate copies of them. To avoid this use the | |
478 * HASH_NONVOLATILE_KEYS option. | |
479 * | |
480 * If the HASH_OWN_KEYS option was specified when creating the table then | |
481 * keys will be considered to be owned by the hash table. In this case | |
482 * the key will be freed when the table is destroyed regardless of | |
483 * whether the HASH_NONVOLATILE_KEYS option was used to allocate its | |
484 * own private copy. | |
485 * | |
486 * Returns: | |
487 * The HashItem created (or matching if a duplicate) on success | |
488 * NULL on failure | |
489 */ | |
490 HashItem *HashTableAdd(HashTable *h, char *key, int key_len, HashData data, | |
491 int *new) { | |
492 uint64_t hv; | |
493 HashItem *hi; | |
494 | |
495 if (!key_len) | |
496 key_len = strlen(key); | |
497 | |
498 hv = hash64(h->options & HASH_FUNC_MASK, (uint8_t *)key, key_len) | |
499 & h->mask; | |
500 | |
501 /* Already exists? */ | |
502 if (!(h->options & HASH_ALLOW_DUP_KEYS)) { | |
503 for (hi = h->bucket[hv]; hi; hi = hi->next) { | |
504 if (key_len == hi->key_len && | |
505 memcmp(key, hi->key, key_len) == 0) { | |
506 if (new) *new = 0; | |
507 return hi; | |
508 } | |
509 } | |
510 } | |
511 | |
512 /* No, so create a new one and link it in */ | |
513 if (NULL == (hi = HashItemCreate(h))) | |
514 return NULL; | |
515 | |
516 if (h->options & HASH_NONVOLATILE_KEYS) | |
517 hi->key = key; | |
518 else { | |
519 hi->key = (char *)malloc(key_len+1); | |
520 memcpy(hi->key, key, key_len); | |
521 hi->key[key_len] = 0; /* null terminate incase others print keys */ | |
522 } | |
523 hi->key_len = key_len; | |
524 hi->data = data; | |
525 hi->next = h->bucket[hv]; | |
526 h->bucket[hv] = hi; | |
527 | |
528 if ((h->options & HASH_DYNAMIC_SIZE) && | |
529 h->nused > HASH_TABLE_RESIZE * h->nbuckets) | |
530 HashTableResize(h, h->nbuckets*4); | |
531 | |
532 if (new) *new = 1; | |
533 | |
534 return hi; | |
535 } | |
536 | |
537 | |
538 /* | |
539 * Removes a specified HashItem from the HashTable. (To perform this it needs | |
540 * to rehash based on the hash key as hash_item only has a next pointer and | |
541 * not a previous pointer.) | |
542 * | |
543 * The HashItem itself is also destroyed (by an internal call to | |
544 * HashItemDestroy). The deallocate_data parameter controls whether the data | |
545 * associated with the HashItem should also be free()d. | |
546 * | |
547 * See also the HashTableRemove() function to remove by key instead of | |
548 * HashItem. | |
549 * | |
550 * Returns 0 on success | |
551 * -1 on failure (eg HashItem not in the HashTable); | |
552 */ | |
553 int HashTableDel(HashTable *h, HashItem *hi, int deallocate_data) { | |
554 uint64_t hv; | |
555 HashItem *next, *last; | |
556 | |
557 hv = hash64(h->options & HASH_FUNC_MASK, | |
558 (uint8_t *)hi->key, hi->key_len) & h->mask; | |
559 | |
560 for (last = NULL, next = h->bucket[hv]; next; | |
561 last = next, next = next->next) { | |
562 if (next == hi) { | |
563 /* Link last to next->next */ | |
564 if (last) | |
565 last->next = next->next; | |
566 else | |
567 h->bucket[hv] = next->next; | |
568 | |
569 HashItemDestroy(h, hi, deallocate_data); | |
570 | |
571 return 0; | |
572 } | |
573 } | |
574 | |
575 return -1; | |
576 } | |
577 | |
578 | |
579 /* | |
580 * Searches the HashTable for the data registered with 'key' and removes | |
581 * these items from the HashTable. In essence this is a combination of | |
582 * HashTableSearch and HashTableDel functions. | |
583 * | |
584 * If HASH_ALLOW_DUP_KEYS is used this will remove all items matching 'key', | |
585 * otherwise just a single item will be removed. | |
586 * | |
587 * If 'deallocate_data' is true the data associated with the HashItem will | |
588 * be free()d. | |
589 * | |
590 * Returns | |
591 * 0 on success (at least one item found) | |
592 * -1 on failure (no items found). | |
593 */ | |
594 int HashTableRemove(HashTable *h, char *key, int key_len, | |
595 int deallocate_data) { | |
596 uint64_t hv; | |
597 HashItem *last, *next, *hi; | |
598 int retval = -1; | |
599 | |
600 if (!key_len) | |
601 key_len = strlen(key); | |
602 | |
603 hv = hash64(h->options & HASH_FUNC_MASK, (uint8_t *)key, key_len) | |
604 & h->mask; | |
605 | |
606 last = NULL; | |
607 next = h->bucket[hv]; | |
608 | |
609 while (next) { | |
610 hi = next; | |
611 if (key_len == hi->key_len && | |
612 memcmp(key, hi->key, key_len) == 0) { | |
613 /* An item to remove, adjust links and destroy */ | |
614 if (last) | |
615 last->next = hi->next; | |
616 else | |
617 h->bucket[hv] = hi->next; | |
618 | |
619 next = hi->next; | |
620 HashItemDestroy(h, hi, deallocate_data); | |
621 | |
622 retval = 0; | |
623 if (!(h->options & HASH_ALLOW_DUP_KEYS)) | |
624 break; | |
625 | |
626 } else { | |
627 /* We only update last when it's something we haven't destroyed */ | |
628 last = hi; | |
629 next = hi->next; | |
630 } | |
631 } | |
632 | |
633 return retval; | |
634 } | |
635 | |
636 /* | |
637 * Searches the HashTable for the data registered with 'key'. | |
638 * If HASH_ALLOW_DUP_KEYS is used this will just be the first one found. | |
639 * You will then need to use HashTableNext to iterate through the matches. | |
640 * | |
641 * Returns | |
642 * HashItem if found | |
643 * NULL if not found | |
644 */ | |
645 HashItem *HashTableSearch(HashTable *h, char *key, int key_len) { | |
646 uint64_t hv; | |
647 HashItem *hi; | |
648 | |
649 if (!key_len) | |
650 key_len = strlen(key); | |
651 | |
652 hv = hash64(h->options & HASH_FUNC_MASK, (uint8_t *)key, key_len) | |
653 & h->mask; | |
654 for (hi = h->bucket[hv]; hi; hi = hi->next) { | |
655 if (key_len == hi->key_len && | |
656 memcmp(key, hi->key, key_len) == 0) | |
657 return hi; | |
658 } | |
659 | |
660 return NULL; | |
661 } | |
662 | |
663 /* | |
664 * Find the next HashItem (starting from 'hi') to also match this key. | |
665 * This is only valid when the HASH_ALLOW_DUP_KEYS is in use. | |
666 * | |
667 * Returns | |
668 * HashItem if found | |
669 * NULL if not found | |
670 */ | |
671 HashItem *HashTableNext(HashItem *hi, char *key, int key_len) { | |
672 if (!hi) | |
673 return NULL; | |
674 | |
675 for (hi = hi->next; hi; hi = hi->next) { | |
676 if (key_len == hi->key_len && | |
677 memcmp(key, hi->key, key_len) == 0) | |
678 return hi; | |
679 } | |
680 | |
681 return NULL; | |
682 } | |
683 | |
684 /* | |
685 * Dumps a textual represenation of the hash table to stdout. | |
686 */ | |
687 void HashTableDump(HashTable *h, FILE *fp) { | |
688 int i; | |
689 for (i = 0; i < h->nbuckets; i++) { | |
690 HashItem *hi; | |
691 for (hi = h->bucket[i]; hi; hi = hi->next) { | |
692 fprintf(fp, "%.*s\n", hi->key_len, hi->key); | |
693 } | |
694 } | |
695 } | |
696 | |
697 /* | |
698 * Produces some simple statistics on the hash table population. | |
699 */ | |
700 void HashTableStats(HashTable *h, FILE *fp) { | |
701 int i; | |
702 double avg = (double)h->nused / h->nbuckets; | |
703 double var = 0; | |
704 int maxlen = 0; | |
705 int filled = 0; | |
706 int clen[51]; | |
707 | |
708 for (i = 0; i <= 50; i++) | |
709 clen[i] = 0; | |
710 | |
711 for (i = 0; i < h->nbuckets; i++) { | |
712 int len = 0; | |
713 HashItem *hi; | |
714 for (hi = h->bucket[i]; hi; hi = hi->next) { | |
715 len++; | |
716 } | |
717 if (len > 0) { | |
718 filled++; | |
719 if (len > maxlen) | |
720 maxlen = len; | |
721 } | |
722 clen[len <= 50 ? len : 50]++; | |
723 var += (len-avg) * (len-avg); | |
724 } | |
725 var /= h->nbuckets; | |
726 /* sd = sqrt(var); */ | |
727 | |
728 fprintf(fp, "Nbuckets = %d\n", h->nbuckets); | |
729 fprintf(fp, "Nused = %d\n", h->nused); | |
730 fprintf(fp, "Avg chain = %f\n", avg); | |
731 fprintf(fp, "Chain var.= %f\n", var); | |
732 fprintf(fp, "%%age full = %f\n", (100.0*filled)/h->nbuckets); | |
733 fprintf(fp, "max len = %d\n", maxlen); | |
734 for (i = 0; i <= maxlen; i++) { | |
735 fprintf(fp, "Chain %2d = %d\n", i, clen[i]); | |
736 } | |
737 } | |
738 | |
739 /* | |
740 * -------------------------------------------------------------------- | |
741 * Below we have a specialisation of the HashTable code where the data | |
742 * attached to the hash table is a position,size pair. This allows for the | |
743 * hash table to encode positions and sizes of items within a file archive. | |
744 * -------------------------------------------------------------------- | |
745 */ | |
746 | |
747 /* | |
748 * Writes the HashTable structures to 'fp'. | |
749 * This is a specialisation of the HashTable where the HashData is a | |
750 * position,size tuple. | |
751 * | |
752 * This consists of the following format: | |
753 * Header: | |
754 * ".hsh" (magic numebr) | |
755 * x4 (1-bytes of version code, eg "1.00") | |
756 * x1 (HASH_FUNC_? function used) | |
757 * x1 (number of file headers: FH. These count from 1 to FH inclusive) | |
758 * x1 (number of file footers: FF. These count from 1 to FF inclusive) | |
759 * x1 (reserved - zero for now) | |
760 * x4 (4-bytes big-endian; number of hash buckets) | |
761 * x8 (offset to add to item positions. eg size of this index) | |
762 * x4 (4-bytes big-endian; number of bytes in hash file, inc. header) | |
763 * Archive name: | |
764 * x1 (length, zero => no name) | |
765 * ? (archive filename) | |
766 * File headers (FH copies of): | |
767 * x8 (position) | |
768 * x4 (size) | |
769 * File footers (FH copies of): | |
770 * x8 (position) | |
771 * x4 (size) | |
772 * Buckets (multiples of) | |
773 * x4 (4-byte offset of linked list pos, rel. to the start of the hdr) | |
774 * Items (per bucket chain, not written if Bucket[?]==0) | |
775 * x1 (key length, zero => end of chain) | |
776 * ? (key) | |
777 * x0.5 (File header to use. zero => none) top 4 bits | |
778 * x0.5 (File footer to use. zero => none) bottom 4 bits | |
779 * x8 (position) | |
780 * x4 (size) | |
781 * ... arbitrary gap (but likely none) | |
782 * Index footer: | |
783 * ".hsh" (magic number) | |
784 * x8 (offset to Hash Header. +ve = absolute, -ve = relative to end) | |
785 * | |
786 * It is designed such that on-disk querying of the hash table can be done | |
787 * purely by forward seeks. (This is generally faster due to pre-fetching of | |
788 * the subsequent blocks by many disk controllers.) | |
789 * | |
790 * Returns: the number of bytes written on success | |
791 * -1 for error | |
792 */ | |
793 uint64_t HashFileSave(HashFile *hf, FILE *fp, int64_t offset) { | |
794 int i; | |
795 HashItem *hi; | |
796 uint32_t *bucket_pos; | |
797 uint64_t hfsize = 0, be_hfsize; | |
798 HashTable *h = hf->h; | |
799 HashFileFooter foot; | |
800 | |
801 /* Compute the coordinates of the hash items */ | |
802 hfsize = HHSIZE; /* header */ | |
803 hfsize += 1 + (hf->archive | |
804 ? strlen(hf->archive) | |
805 : 0); /* filename */ | |
806 hfsize += h->nbuckets * 4; /* buckets */ | |
807 for (i = 0; i < hf->nheaders; i++) /* headers */ | |
808 hfsize += 12; | |
809 for (i = 0; i < hf->nfooters; i++) /* footers */ | |
810 hfsize += 12; | |
811 bucket_pos = (uint32_t *)calloc(h->nbuckets, sizeof(uint32_t)); | |
812 for (i = 0; i < h->nbuckets; i++) { | |
813 bucket_pos[i] = hfsize; | |
814 | |
815 if (!(hi = h->bucket[i])) | |
816 continue; | |
817 for (; hi; hi = hi->next) { | |
818 hfsize += 1 + 1 + hi->key_len + 8 + 4; /* keys, pos, size */ | |
819 } | |
820 hfsize++; /* list-end marker */ | |
821 } | |
822 hfsize += sizeof(foot); | |
823 | |
824 /* Write the header: */ | |
825 memcpy(hf->hh.magic, HASHFILE_MAGIC, 4); | |
826 memcpy(hf->hh.vers, HASHFILE_VERSION, 4); | |
827 hf->hh.hfunc = h->options & HASH_FUNC_MASK; | |
828 hf->hh.nheaders = hf->nheaders; | |
829 hf->hh.nfooters = hf->nfooters; | |
830 hf->hh.nbuckets = be_int4(h->nbuckets); | |
831 hf->hh.offset = offset == HASHFILE_PREPEND | |
832 ? be_int8(hfsize) /* archive will be append to this file */ | |
833 : be_int8(offset); | |
834 hf->hh.size = be_int4(hfsize); | |
835 fwrite(&hf->hh, HHSIZE, 1, fp); | |
836 | |
837 /* Write the archive filename, if known */ | |
838 if (hf->archive && *hf->archive) { | |
839 fputc(strlen(hf->archive), fp); | |
840 fputs(hf->archive, fp); | |
841 } else { | |
842 fputc(0, fp); | |
843 } | |
844 | |
845 /* Write out the headers and footers */ | |
846 for (i = 0; i < hf->nheaders; i++) { | |
847 HashFileSection hs; | |
848 hs.pos = be_int8(hf->headers[i].pos); | |
849 fwrite(&hs.pos, 8, 1, fp); | |
850 hs.size = be_int4(hf->headers[i].size); | |
851 fwrite(&hs.size, 4, 1, fp); | |
852 } | |
853 | |
854 for (i = 0; i < hf->nfooters; i++) { | |
855 HashFileSection hs; | |
856 hs.pos = be_int8(hf->footers[i].pos); | |
857 fwrite(&hs.pos, 8, 1, fp); | |
858 hs.size = be_int4(hf->footers[i].size); | |
859 fwrite(&hs.size, 4, 1, fp); | |
860 } | |
861 | |
862 /* Write out hash buckets */ | |
863 for (i = 0; i < h->nbuckets; i++) { | |
864 uint32_t zero = 0; | |
865 uint32_t be32; | |
866 | |
867 if (!(hi = h->bucket[i])) { | |
868 fwrite(&zero, 4, 1, fp); | |
869 continue; | |
870 } | |
871 | |
872 be32 = be_int4(bucket_pos[i]); | |
873 fwrite(&be32, 4, 1, fp); | |
874 } | |
875 free(bucket_pos); | |
876 | |
877 /* | |
878 * Write the hash_item linked lists. The first item is the | |
879 * hash key length. We append a zero to the end of the list so we | |
880 * can check this key length to determine the end of this hash | |
881 * item list. | |
882 */ | |
883 for (i = 0; i < h->nbuckets; i++) { | |
884 if (!(hi = h->bucket[i])) | |
885 continue; | |
886 for (; hi; hi = hi->next) { | |
887 uint64_t be64; | |
888 uint32_t be32; | |
889 HashFileItem *hfi = (HashFileItem *)hi->data.p; | |
890 unsigned char headfoot = 0; | |
891 | |
892 fprintf(fp, "%c%.*s", hi->key_len, | |
893 hi->key_len, hi->key); | |
894 headfoot = (((hfi->header) & 0xf) << 4) | ((hfi->footer) & 0xf); | |
895 fwrite(&headfoot, 1, 1, fp); | |
896 be64 = be_int8(hfi->pos); | |
897 fwrite(&be64, 8, 1, fp); | |
898 be32 = be_int4(hfi->size); | |
899 fwrite(&be32, 4, 1, fp); | |
900 } | |
901 fputc(0, fp); | |
902 } | |
903 | |
904 /* Finally write the footer referencing back to the header start */ | |
905 memcpy(foot.magic, HASHFILE_MAGIC, 4); | |
906 be_hfsize = be_int8(-hfsize); | |
907 memcpy(foot.offset, &be_hfsize, 8); | |
908 fwrite(&foot, sizeof(foot), 1, fp); | |
909 | |
910 return hfsize; | |
911 } | |
912 | |
913 /* | |
914 * Reads an entire HashTable from fp. | |
915 * | |
916 * Returns: | |
917 * A filled out HashTable pointer on success | |
918 * NULL on failure | |
919 */ | |
920 HashFile *HashFileLoad_old(FILE *fp) { | |
921 int i; | |
922 HashTable *h; | |
923 HashItem *hi; | |
924 HashFile *hf; | |
925 uint32_t *bucket_pos; | |
926 unsigned char *htable; | |
927 int htable_pos; | |
928 int fnamelen; | |
929 | |
930 if (NULL == (hf = (HashFile *)calloc(1, sizeof(HashFile)))) | |
931 return NULL; | |
932 if (NULL == (htable = (unsigned char *)malloc(HHSIZE))) | |
933 return NULL; | |
934 | |
935 /* Read and create the hash table header */ | |
936 if (HHSIZE != fread(htable, 1, HHSIZE, fp)) | |
937 return NULL; | |
938 memcpy(&hf->hh, htable, HHSIZE); | |
939 hf->hh.nbuckets = be_int4(hf->hh.nbuckets); | |
940 hf->hh.offset = be_int8(hf->hh.offset); | |
941 hf->hh.size = be_int4(hf->hh.size); | |
942 hf->h = h = HashTableCreate(hf->hh.nbuckets, hf->hh.hfunc); | |
943 bucket_pos = (uint32_t *)calloc(h->nbuckets, sizeof(uint32_t)); | |
944 | |
945 /* Load the archive filename */ | |
946 fnamelen = fgetc(fp); | |
947 if (fnamelen) { | |
948 hf->archive = (char *)malloc(fnamelen+1); | |
949 fread(hf->archive, 1, fnamelen, fp); | |
950 hf->archive[fnamelen] = 0; | |
951 if (hf->archive[0] == 0) { | |
952 free(hf->archive); | |
953 hf->archive = NULL; | |
954 } | |
955 } | |
956 | |
957 /* Load the rest of the hash table to memory */ | |
958 htable_pos = HHSIZE + fnamelen + 1; | |
959 if (NULL == (htable = (unsigned char *)realloc(htable, hf->hh.size))) | |
960 return NULL; | |
961 if (hf->hh.size-htable_pos != | |
962 fread(&htable[htable_pos], 1, hf->hh.size-htable_pos, fp)) | |
963 return NULL; | |
964 | |
965 /* Read the header / footer items */ | |
966 for (i = 0; i < hf->hh.nheaders; i++) | |
967 htable_pos += 8; /* skip them for now */ | |
968 for (i = 0; i < hf->hh.nfooters; i++) | |
969 htable_pos += 8; /* skip them for now */ | |
970 | |
971 /* Identify the "bucket pos". Detemines which buckets have data */ | |
972 for (i = 0; i < h->nbuckets; i++) { | |
973 memcpy(&bucket_pos[i], &htable[htable_pos], 4); | |
974 bucket_pos[i] = be_int4(bucket_pos[i]); | |
975 htable_pos += 4; | |
976 } | |
977 | |
978 /* Read the hash table items */ | |
979 for (i = 0; i < h->nbuckets; i++) { | |
980 if (!bucket_pos[i]) | |
981 continue; | |
982 for (;;) { | |
983 int c; | |
984 unsigned char uc; | |
985 char key[256]; | |
986 uint64_t pos; | |
987 uint32_t size; | |
988 HashFileItem *hfi; | |
989 | |
990 c = htable[htable_pos++]; | |
991 if (c == EOF || !c) | |
992 break; | |
993 | |
994 /* key */ | |
995 memcpy(key, &htable[htable_pos], c); | |
996 htable_pos += c; | |
997 | |
998 /* header/footer */ | |
999 uc = htable[htable_pos++]; | |
1000 hfi = (HashFileItem *)malloc(sizeof(*hfi)); | |
1001 hfi->header = (uc >> 4) & 0xf; | |
1002 hfi->footer = uc & 0xf; | |
1003 | |
1004 /* pos */ | |
1005 memcpy(&pos, &htable[htable_pos], 8); | |
1006 htable_pos += 8; | |
1007 hfi->pos = be_int8(pos) + hf->hh.offset; | |
1008 | |
1009 /* size */ | |
1010 memcpy(&size, &htable[htable_pos], 4); | |
1011 htable_pos += 4; | |
1012 hfi->size = be_int4(size); | |
1013 | |
1014 /* Add to the hash table */ | |
1015 hi = HashItemCreate(h); | |
1016 hi->next = h->bucket[i]; | |
1017 h->bucket[i] = hi; | |
1018 hi->key_len = c; | |
1019 hi->key = (char *)malloc(c+1); | |
1020 memcpy(hi->key, key, c); | |
1021 hi->key[c] = 0; /* For debugging convenience only */ | |
1022 hi->data.p = hfi; | |
1023 } | |
1024 } | |
1025 | |
1026 fprintf(stderr, "done\n"); | |
1027 fflush(stderr); | |
1028 free(bucket_pos); | |
1029 | |
1030 return hf; | |
1031 } | |
1032 | |
1033 /* | |
1034 * Opens a stored hash table file. It also internally keeps an open file to | |
1035 * hash and the archive files. | |
1036 * | |
1037 * Returns the HashFile pointer on success | |
1038 * NULL on failure | |
1039 */ | |
1040 HashFile *HashFileFopen(FILE *fp) { | |
1041 HashFile *hf = HashFileCreate(0, 0); | |
1042 int archive_len; | |
1043 int i; | |
1044 | |
1045 /* Set the stdio buffer to be small to avoid massive I/O wastage */ | |
1046 | |
1047 /* Read the header */ | |
1048 hf->hfp = fp; | |
1049 hf->afp = fp; | |
1050 hf->hf_start = ftello(hf->hfp); | |
1051 | |
1052 if (HHSIZE != fread(&hf->hh, 1, HHSIZE, hf->hfp)) { | |
1053 HashFileDestroy(hf); | |
1054 return NULL; | |
1055 } | |
1056 if (memcmp(HASHFILE_MAGIC, &hf->hh, 4) != 0) { | |
1057 HashFileFooter foot; | |
1058 int64_t offset; | |
1059 | |
1060 /* Invalid magic number, try other end of file! */ | |
1061 fseeko(hf->hfp, -(off_t)sizeof(HashFileFooter), SEEK_END); | |
1062 if (sizeof(foot) != fread(&foot, 1, sizeof(foot), hf->hfp)) { | |
1063 HashFileDestroy(hf); | |
1064 return NULL; | |
1065 } | |
1066 if (memcmp(HASHFILE_MAGIC, &foot.magic, 4) != 0) { | |
1067 HashFileDestroy(hf); | |
1068 return NULL; | |
1069 } | |
1070 memcpy(&offset, foot.offset, 8); | |
1071 offset = be_int8(offset); | |
1072 fseeko(hf->hfp, offset, SEEK_CUR); | |
1073 hf->hf_start = ftello(hf->hfp); | |
1074 if (HHSIZE != fread(&hf->hh, 1, HHSIZE, hf->hfp)) { | |
1075 HashFileDestroy(hf); | |
1076 return NULL; | |
1077 } | |
1078 } | |
1079 if (memcmp(hf->hh.vers, HASHFILE_VERSION, 4) != 0) { | |
1080 /* incorrect version */ | |
1081 HashFileDestroy(hf); | |
1082 return NULL; | |
1083 } | |
1084 | |
1085 hf->hh.nbuckets = be_int4(hf->hh.nbuckets); | |
1086 hf->hh.offset = be_int8(hf->hh.offset); | |
1087 hf->hh.size = be_int4(hf->hh.size); | |
1088 | |
1089 /* Load the main archive filename */ | |
1090 if ((archive_len = fgetc(hf->hfp))) { | |
1091 hf->archive = (char *)malloc(archive_len+1); | |
1092 fread(hf->archive, 1, archive_len, hf->hfp); | |
1093 hf->archive[archive_len] = 0; | |
1094 if (hf->archive[0] == 0) { | |
1095 free(hf->archive); | |
1096 hf->archive = NULL; | |
1097 } | |
1098 } | |
1099 | |
1100 hf->header_size = HHSIZE + 1 + archive_len + | |
1101 12 * (hf->hh.nheaders + hf->hh.nfooters); | |
1102 hf->nheaders = hf->hh.nheaders; | |
1103 hf->nfooters = hf->hh.nfooters; | |
1104 | |
1105 /* Load the header and footer locations */ | |
1106 hf->headers = hf->nheaders | |
1107 ? (HashFileSection *)malloc(hf->nheaders * sizeof(HashFileSection)) | |
1108 : NULL; | |
1109 for (i = 0; i < hf->nheaders; i++) { | |
1110 fread(&hf->headers[i].pos, 8, 1, hf->hfp); | |
1111 fread(&hf->headers[i].size, 4, 1, hf->hfp); | |
1112 hf->headers[i].pos = be_int8(hf->headers[i].pos) + hf->hh.offset; | |
1113 hf->headers[i].size = be_int4(hf->headers[i].size); | |
1114 hf->headers[i].cached_data = NULL; | |
1115 } | |
1116 | |
1117 hf->footers = hf->nfooters | |
1118 ? (HashFileSection *)malloc(hf->nfooters * sizeof(HashFileSection)) | |
1119 : NULL; | |
1120 for (i = 0; i < hf->nfooters; i++) { | |
1121 fread(&hf->footers[i].pos, 8, 1, hf->hfp); | |
1122 fread(&hf->footers[i].size, 4, 1, hf->hfp); | |
1123 hf->footers[i].pos = be_int8(hf->footers[i].pos) + hf->hh.offset; | |
1124 hf->footers[i].size = be_int4(hf->footers[i].size); | |
1125 hf->footers[i].cached_data = NULL; | |
1126 } | |
1127 | |
1128 return hf; | |
1129 } | |
1130 | |
1131 HashFile *HashFileOpen(char *fname) { | |
1132 FILE *fp; | |
1133 HashFile *hf; | |
1134 | |
1135 /* Open the hash and read the header */ | |
1136 if (NULL == (fp = fopen(fname, "rb"))) | |
1137 return NULL; | |
1138 | |
1139 if (!(hf = HashFileFopen(fp))) | |
1140 return NULL; | |
1141 | |
1142 /* Open the main archive too */ | |
1143 if (hf->archive) { | |
1144 if (NULL == (hf->afp = fopen(hf->archive, "rb"))) { | |
1145 /* Possibly done via a relative pathname (optimal infact) */ | |
1146 char *cp; | |
1147 char aname[1024]; | |
1148 if (NULL == (cp = strrchr(fname, '/'))) { | |
1149 HashFileDestroy(hf); | |
1150 return NULL; | |
1151 } | |
1152 sprintf(aname, "%.*s%s", (int)(cp-fname+1), fname, hf->archive); | |
1153 if (NULL == (hf->afp = fopen(aname, "rb"))) { | |
1154 | |
1155 return NULL; | |
1156 } | |
1157 } | |
1158 } else { | |
1159 hf->afp = hf->hfp; | |
1160 } | |
1161 | |
1162 return hf; | |
1163 } | |
1164 | |
1165 HashFile *HashFileLoad(FILE *fp) { | |
1166 HashFile *hf; | |
1167 char *htable; | |
1168 off_t htable_pos; | |
1169 int i; | |
1170 HashItem *hi; | |
1171 HashTable *h; | |
1172 uint32_t *bucket_pos; | |
1173 uint32_t hsize; | |
1174 | |
1175 /* Open the hash table */ | |
1176 fseeko(fp, 0, SEEK_SET); | |
1177 if (NULL == (hf = HashFileFopen(fp))) | |
1178 return NULL; | |
1179 | |
1180 HashTableDestroy(hf->h, 1); | |
1181 h = hf->h = HashTableCreate(hf->hh.nbuckets, hf->hh.hfunc); | |
1182 bucket_pos = (uint32_t *)calloc(h->nbuckets, sizeof(uint32_t)); | |
1183 | |
1184 /* Also load in the entire thing to memory */ | |
1185 htable = (char *)malloc(hf->hh.size); | |
1186 fseeko(fp, hf->hf_start, SEEK_SET); | |
1187 hsize = fread(htable, 1, hf->hh.size, fp); | |
1188 if (hf->hh.size != hsize) { | |
1189 free(htable); | |
1190 return NULL; | |
1191 } | |
1192 | |
1193 /* | |
1194 * HashFileOpen has already decoded the headers up to and including the | |
1195 * individual file header/footer sections, but not the buckets and item | |
1196 * lists, so we start from there. | |
1197 */ | |
1198 htable_pos = hf->header_size; | |
1199 | |
1200 /* Identify the "bucket pos". Detemines which buckets have data */ | |
1201 for (i = 0; i < h->nbuckets; i++) { | |
1202 memcpy(&bucket_pos[i], &htable[htable_pos], 4); | |
1203 bucket_pos[i] = be_int4(bucket_pos[i]); | |
1204 htable_pos += 4; | |
1205 } | |
1206 | |
1207 /* Read the hash table items */ | |
1208 for (i = 0; i < h->nbuckets; i++) { | |
1209 if (!bucket_pos[i]) | |
1210 continue; | |
1211 for (;;) { | |
1212 int c; | |
1213 unsigned char uc; | |
1214 char key[256]; | |
1215 uint64_t pos; | |
1216 uint32_t size; | |
1217 HashFileItem *hfi; | |
1218 | |
1219 c = htable[htable_pos++]; | |
1220 if (c == EOF || !c) | |
1221 break; | |
1222 | |
1223 /* key */ | |
1224 memcpy(key, &htable[htable_pos], c); | |
1225 htable_pos += c; | |
1226 | |
1227 /* header/footer */ | |
1228 uc = htable[htable_pos++]; | |
1229 hfi = (HashFileItem *)malloc(sizeof(*hfi)); | |
1230 hfi->header = (uc >> 4) & 0xf; | |
1231 hfi->footer = uc & 0xf; | |
1232 | |
1233 /* pos */ | |
1234 memcpy(&pos, &htable[htable_pos], 8); | |
1235 htable_pos += 8; | |
1236 hfi->pos = be_int8(pos) + hf->hh.offset; | |
1237 | |
1238 /* size */ | |
1239 memcpy(&size, &htable[htable_pos], 4); | |
1240 htable_pos += 4; | |
1241 hfi->size = be_int4(size); | |
1242 | |
1243 /* Add to the hash table */ | |
1244 hi = HashItemCreate(h); | |
1245 hi->next = h->bucket[i]; | |
1246 h->bucket[i] = hi; | |
1247 hi->key_len = c; | |
1248 hi->key = (char *)malloc(c+1); | |
1249 memcpy(hi->key, key, c); | |
1250 hi->key[c] = 0; /* For debugging convenience only */ | |
1251 hi->data.p = hfi; | |
1252 } | |
1253 } | |
1254 | |
1255 fflush(stderr); | |
1256 free(bucket_pos); | |
1257 | |
1258 return hf; | |
1259 } | |
1260 | |
1261 /* | |
1262 * Searches the named HashFile for a specific key. | |
1263 * When found it returns the position and size of the object in pos and size. | |
1264 * | |
1265 * Returns | |
1266 * 0 on success (pos & size updated) | |
1267 * -1 on failure | |
1268 */ | |
1269 int HashFileQuery(HashFile *hf, uint8_t *key, int key_len, | |
1270 HashFileItem *item) { | |
1271 uint64_t hval; | |
1272 uint32_t pos; | |
1273 int klen; | |
1274 int cur_offset = 0; | |
1275 | |
1276 /* Hash 'key' to compute the bucket number */ | |
1277 hval = hash64(hf->hh.hfunc, key, key_len) & (hf->hh.nbuckets-1); | |
1278 | |
1279 /* Read the bucket to find the first linked list item location */ | |
1280 if (-1 == fseeko(hf->hfp, hf->hf_start + 4*hval + hf->header_size,SEEK_SET)) | |
1281 return -1; | |
1282 if (4 != fread(&pos, 1, 4, hf->hfp)) | |
1283 return -1; | |
1284 pos = be_int4(pos); | |
1285 cur_offset = 4*hval + 4 + hf->header_size; | |
1286 | |
1287 if (0 == pos) | |
1288 /* No bucket pos => key not present */ | |
1289 return -1; | |
1290 | |
1291 /* Jump to the HashItems list and look through for key */ | |
1292 if (-1 == fseeko(hf->hfp, pos - cur_offset, SEEK_CUR)) | |
1293 return -1; | |
1294 | |
1295 for (klen = fgetc(hf->hfp); klen; klen = fgetc(hf->hfp)) { | |
1296 char k[256]; | |
1297 unsigned char headfoot; | |
1298 uint64_t pos; | |
1299 uint32_t size; | |
1300 | |
1301 fread(k, klen, 1, hf->hfp); | |
1302 fread(&headfoot, 1, 1, hf->hfp); | |
1303 item->header = (headfoot >> 4) & 0xf; | |
1304 item->footer = headfoot & 0xf; | |
1305 fread(&pos, 8, 1, hf->hfp); | |
1306 pos = be_int8(pos) + hf->hh.offset; | |
1307 fread(&size, 4, 1, hf->hfp); | |
1308 size = be_int4(size); | |
1309 if (klen == key_len && 0 == memcmp(key, k, key_len)) { | |
1310 item->pos = pos; | |
1311 item->size = size; | |
1312 return 0; | |
1313 } | |
1314 } | |
1315 | |
1316 return -1; | |
1317 } | |
1318 | |
1319 HashFile *HashFileCreate(int size, int options) { | |
1320 HashFile *hf; | |
1321 | |
1322 if (NULL == (hf = (HashFile *)calloc(1, sizeof(*hf)))) | |
1323 return NULL; | |
1324 if (NULL == (hf->h = HashTableCreate(size, options))) | |
1325 return NULL; | |
1326 | |
1327 return hf; | |
1328 } | |
1329 | |
1330 void HashFileDestroy(HashFile *hf) { | |
1331 if (!hf) | |
1332 return; | |
1333 | |
1334 if (hf->h) | |
1335 HashTableDestroy(hf->h, 1); | |
1336 if (hf->archive) | |
1337 free(hf->archive); | |
1338 if (hf->headers) { | |
1339 int i; | |
1340 for (i = 0; i < hf->nheaders; i++) { | |
1341 if (hf->headers[i].cached_data) | |
1342 free(hf->headers[i].cached_data); | |
1343 } | |
1344 free(hf->headers); | |
1345 } | |
1346 if (hf->footers) { | |
1347 int i; | |
1348 for (i = 0; i < hf->nfooters; i++) { | |
1349 if (hf->footers[i].cached_data) | |
1350 free(hf->footers[i].cached_data); | |
1351 } | |
1352 free(hf->footers); | |
1353 } | |
1354 | |
1355 if (hf->afp) | |
1356 fclose(hf->afp); | |
1357 if (hf->hfp && hf->hfp != hf->afp) | |
1358 fclose(hf->hfp); | |
1359 | |
1360 free(hf); | |
1361 } | |
1362 | |
1363 /* | |
1364 * Extracts the contents for a file out of the HashFile. | |
1365 */ | |
1366 char *HashFileExtract(HashFile *hf, char *fname, size_t *len) { | |
1367 HashFileItem hfi; | |
1368 size_t sz, pos; | |
1369 char *data; | |
1370 HashFileSection *head = NULL, *foot = NULL; | |
1371 | |
1372 /* Find out if and where the item is in the archive */ | |
1373 if (-1 == HashFileQuery(hf, (uint8_t *)fname, strlen(fname), &hfi)) | |
1374 return NULL; | |
1375 | |
1376 /* Work out the size including header/footer and allocate */ | |
1377 sz = hfi.size; | |
1378 if (hfi.header) { | |
1379 head = &hf->headers[hfi.header-1]; | |
1380 sz += head->size; | |
1381 } | |
1382 if (hfi.footer) { | |
1383 foot = &hf->footers[hfi.footer-1]; | |
1384 sz += foot->size; | |
1385 } | |
1386 *len = sz; | |
1387 | |
1388 if (NULL == (data = (char *)malloc(sz+1))) | |
1389 return NULL; | |
1390 data[sz] = 0; | |
1391 | |
1392 /* Header */ | |
1393 pos = 0; | |
1394 if (head) { | |
1395 fseeko(hf->afp, head->pos, SEEK_SET); | |
1396 fread(&data[pos], head->size, 1, hf->afp); | |
1397 pos += head->size; | |
1398 } | |
1399 | |
1400 /* Main file */ | |
1401 fseeko(hf->afp, hfi.pos, SEEK_SET); | |
1402 fread(&data[pos], hfi.size, 1, hf->afp); | |
1403 pos += hfi.size; | |
1404 | |
1405 /* Footer */ | |
1406 if (foot) { | |
1407 fseeko(hf->afp, foot->pos, SEEK_SET); | |
1408 fread(&data[pos], foot->size, 1, hf->afp); | |
1409 pos += foot->size; | |
1410 } | |
1411 | |
1412 return data; | |
1413 } | |
1414 | |
1415 /* | |
1416 * Iterates through members of a hash table returning items sequentially. | |
1417 * | |
1418 * Returns the next HashItem on success | |
1419 * NULL on failure. | |
1420 */ | |
1421 HashItem *HashTableIterNext(HashTable *h, HashIter *iter) { | |
1422 do { | |
1423 if (iter->hi == NULL) { | |
1424 if (++iter->bnum >= h->nbuckets) | |
1425 break; | |
1426 iter->hi = h->bucket[iter->bnum]; | |
1427 } else { | |
1428 iter->hi = iter->hi->next; | |
1429 } | |
1430 } while (!iter->hi); | |
1431 | |
1432 return iter->hi; | |
1433 } | |
1434 | |
1435 void HashTableIterReset(HashIter *iter) { | |
1436 if (iter) { | |
1437 iter->bnum = -1; | |
1438 iter->hi = NULL; | |
1439 } | |
1440 } | |
1441 | |
1442 HashIter *HashTableIterCreate(void) { | |
1443 HashIter *iter = (HashIter *)malloc(sizeof(*iter)); | |
1444 | |
1445 HashTableIterReset(iter); | |
1446 return iter; | |
1447 } | |
1448 | |
1449 void HashTableIterDestroy(HashIter *iter) { | |
1450 if (iter) | |
1451 free(iter); | |
1452 } |