comparison clustalomega/clustal-omega-0.2.0/src/squid/gki.c @ 0:ff1768533a07

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 17:04:25 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ff1768533a07
1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
4 *
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
7 * for details.
8 *****************************************************************/
9
10 /* gki.c
11 * SRE, Sat May 1 14:49:08 1999
12 *
13 * "generic key index" module: emulation of Perl hashes.
14 * Maps keys (ASCII char strings) to array index. Dynamically
15 * resizes the hash table.
16 *
17 * Limitations:
18 * - hash table can only grow; no provision for deleting keys
19 * or downsizing the hash table.
20 * - Maximum hash table size set at 100003. Performance
21 * will degrade for key sets much larger than this.
22 * - Assumes that integers are 32 bits (or greater).
23 *
24 * Defines a typedef'd structure:
25 * gki - a key index hash table.
26 * Provides functions:
27 * GKIInit() - start a hash table.
28 * GKIStoreKey() - store a new key, get a unique index.
29 * GKIKeyIndex() - retrieve an existing key's index.
30 * GKIFree() - free a hash table.
31 * GKIStatus() - Debugging: prints internal status of a hash struct
32 *
33 *
34 * Note that there are no dependencies on squid; the gki.c/gki.h
35 * pair are base ANSI C and can be reused anywhere.
36 *****************************************************************
37 *
38 * API for storing/reading stuff:
39 * moral equivalent of Perl's $foo{$key} = whatever, $bar{$key} = whatever:
40 * #include "gki.h"
41 *
42 * gki *hash;
43 * int idx;
44 * char *key;
45 *
46 * hash = GKIInit();
47 * (Storing:)
48 * (foreach key) {
49 * idx = GKIStoreKey(hash, key);
50 * (reallocate foo, bar as needed)
51 * foo[idx] = whatever;
52 * bar[idx] = whatever;
53 * }
54 * (Reading:)
55 * (foreach key) {
56 * idx = GKIKeyIndex(hash, key);
57 * if (idx == -1) {no_such_key; }
58 * (do something with) foo[idx];
59 * (do something with) bar[idx];
60 * }
61 * GKIFree();
62 *
63 *****************************************************************
64 *
65 * Timings on wrasse for 45402 keys in /usr/dict/words using
66 * Tests/test_gki:
67 * 250 msec store (6 usec/store)
68 * 140 msec retrieve (3 usec/retrieve)
69 * and using the 13408 names of Pfam's GP120.full alignment:
70 * 70 msec store (5 usec/store)
71 * 50 msec retrieve (4 usec/retrieve)
72 *
73 * RCS $Id: gki.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: gki.c,v 1.3 2000/12/21 23:42:59 eddy Exp)
74 */
75
76
77
78 #include <stdio.h>
79 #include <stdlib.h>
80 #include <string.h>
81 #include <limits.h>
82 #include "squid.h"
83 #include "gki.h"
84
85 /*
86 * Best hash table sizes are prime numbers (see Knuth vol 3, Sorting
87 * and Searching).
88 * gki_primes[] defines the ascending order of hash table sizes
89 * that we use in upsizing the hash table dynamically.
90 * useful site for testing primes:
91 * http://www.idbsu.edu/people/jbrennan/algebra/numbers/sieve.html
92 * Because of the way gki_hashvalue works, the largest number
93 * must be < INT_MAX / 128 / 128 : 131072 on a 32 bit machine.
94 */
95 static int gki_primes[] = { 101, 1009, 10007, 100003 };
96 #define GKI_NPRIMES 4
97 #define GKI_ALPHABETSIZE 128
98
99 static GKI *gki_alloc(int primelevel);
100 static int gki_hashvalue(GKI *hash, char *key);
101 static int gki_upsize(GKI *old);
102
103
104 /* Function: GKIInit()
105 * Date: SRE, Sat May 1 11:12:24 1999 [May Day geek-out]
106 *
107 * Purpose: Initialize a hash table for key indexing.
108 * Simply a wrapper around a level 0 gki_alloc().
109 *
110 * Args: (void)
111 *
112 * Returns: An allocated hash table structure.
113 * Caller frees with GKIFree().
114 */
115 GKI *
116 GKIInit(void)
117 {
118 GKI *hash;
119 hash = gki_alloc(0);
120 return hash;
121 }
122
123 /* Function: GKIFree()
124 * Date: SRE, Sat May 1 11:13:26 1999 [May Day geek-out]
125 *
126 * Purpose: Free a key index hash table.
127 *
128 * Args: hash - the gki structure
129 *
130 * Returns: (void).
131 * hash table is destroyed.
132 */
133 void
134 GKIFree(GKI *hash)
135 {
136 struct gki_elem *ptr;
137 int i;
138
139 if (hash == NULL) return; /* tolerate a NULL */
140
141 for (i = 0; i < hash->nhash; i++)
142 while (hash->table[i] != NULL)
143 {
144 ptr = hash->table[i]->nxt;
145 /* NULL keys can occur after we've gki_upsize'd */
146 if (hash->table[i]->key != NULL) free(hash->table[i]->key);
147 free(hash->table[i]);
148 hash->table[i] = ptr;
149 }
150 free(hash->table);
151 free(hash);
152 }
153
154 /* Function: GKIStoreKey()
155 * Date: SRE, Sat May 1 11:16:48 1999 [May Day geek-out]
156 *
157 * Purpose: Store a key in the key index hash table.
158 * Associate it with a unique "key index", counting
159 * from 0. (It's this index that lets us map
160 * the hashed keys to indexed C arrays, (clumsily)
161 * emulating Perl's hashes.)
162 *
163 * Does *not* check to see if the key's already
164 * in the table, so it's possible to store multiple
165 * copies of a key with different indices; probably
166 * not what you want, so if you're not sure the
167 * key is unique, check the table first with
168 * GKIKeyIndex().
169 *
170 * Args: hash - GKI structure to store the key in
171 * key - string to store
172 *
173 * Returns: the new key's index. Since it's always the
174 * last one in the current array, this index is
175 * just hash->nkeys-1.
176 * On a malloc failure, returns -1.
177 * hash table is modified.
178 */
179 int
180 GKIStoreKey(GKI *hash, char *key)
181 {
182 int val;
183 struct gki_elem *ptr;
184
185 val = gki_hashvalue(hash, key);
186
187 ptr = hash->table[val];
188 hash->table[val] = MallocOrDie(sizeof(struct gki_elem));
189 hash->table[val]->key = MallocOrDie(sizeof(char) * (strlen(key)+1));
190 strcpy(hash->table[val]->key, key);
191
192 hash->table[val]->idx = hash->nkeys;
193 hash->table[val]->nxt = ptr;
194
195 hash->nkeys++;
196 /* time to upsize? */
197 if (hash->nkeys > 3*hash->nhash && hash->primelevel < GKI_NPRIMES-1)
198 gki_upsize(hash);
199
200 return hash->nkeys-1;
201 }
202
203 /* Function: GKIKeyIndex()
204 * Date: SRE, Sat May 1 11:20:42 1999 [May Day geek-out]
205 *
206 * Purpose: Look up a key in the hash table. Return
207 * its index (0..nkeys-1), else -1 if the key
208 * isn't in the hash (yet).
209 *
210 * Args: hash - the GKI hash table to search in
211 * key - the key to look up
212 *
213 * Returns: -1 if key is not found;
214 * index of key if it is found (range 0..nkeys-1).
215 * hash table is unchanged.
216 */
217 int
218 GKIKeyIndex(GKI *hash, char *key)
219 {
220 struct gki_elem *ptr;
221 int val;
222
223 val = gki_hashvalue(hash, key);
224 for (ptr = hash->table[val]; ptr != NULL; ptr = ptr->nxt)
225 if (strcmp(key, ptr->key) == 0) return ptr->idx;
226 return -1;
227 }
228
229 /* Function: GKIStatus()
230 * Date: SRE, Sat May 1 11:11:13 1999 [St. Louis]
231 *
232 * Purpose: (DEBUGGING) How are we doing? Calculate some
233 * simple statistics for the hash table.
234 *
235 * Args: hash - the GKI hash table to look at
236 *
237 * Returns: (void)
238 * Prints diagnostics on stdout.
239 * hash table is unchanged.
240 */
241 void
242 GKIStatus(GKI *hash)
243 {
244 struct gki_elem *ptr;
245 int i;
246 int nkeys;
247 int nempty = 0;
248 int maxkeys = -1;
249 int minkeys = INT_MAX;
250
251 for (i = 0; i < hash->nhash; i++)
252 {
253 nkeys = 0;
254 for (ptr = hash->table[i]; ptr != NULL; ptr = ptr->nxt)
255 nkeys++;
256
257 if (nkeys == 0) nempty++;
258 if (nkeys > maxkeys) maxkeys = nkeys;
259 if (nkeys < minkeys) minkeys = nkeys;
260 }
261
262 printf("Total keys: %d\n", hash->nkeys);
263 printf("Hash table size: %d\n", hash->nhash);
264 printf("Average occupancy: %.1f\n", (float) hash->nkeys / (float) hash->nhash);
265 printf("Unoccupied slots: %d\n", nempty);
266 printf("Most in one slot: %d\n", maxkeys);
267 printf("Least in one slot: %d\n", minkeys);
268
269 }
270
271
272 /* Function: gki_alloc()
273 * Date: SRE, Sat May 1 11:55:47 1999 [May Day geek-out]
274 *
275 * Purpose: Allocate a hash table structure with the
276 * size given by primelevel.
277 *
278 * Args: primelevel - level 0..GKI_NPRIMES-1, specifying
279 * the size of the table; see gki_primes[]
280 * array.
281 *
282 * Returns: An allocated hash table structure.
283 * Caller frees with GKIFree().
284 */
285 static GKI *
286 gki_alloc(int primelevel)
287 {
288 GKI *hash;
289 int i;
290
291 if (primelevel < 0 || primelevel >= GKI_NPRIMES)
292 Die("bad primelevel in gki_alloc()");
293 hash = MallocOrDie(sizeof(GKI));
294
295 hash->primelevel = primelevel;
296 hash->nhash = gki_primes[hash->primelevel];
297 hash->table = MallocOrDie(sizeof(struct gki_elem) * hash->nhash);
298 for (i = 0; i < hash->nhash; i++)
299 hash->table[i] = NULL;
300 hash->nkeys = 0;
301 return hash;
302 }
303
304
305 /* Function: gki_hashvalue()
306 * Date: SRE, Sat May 1 11:14:10 1999 [May Day geek-out]
307 *
308 * Purpose: Calculate the hash value for a key. Usually
309 * we expect a one-word key, but the function will
310 * hash any ASCII string effectively. The hash function
311 * is a simple one (see p. 233 of Sedgewick,
312 * Algorithms in C).
313 * Slightly optimized: does two characters at a time
314 * before doing the modulo; this gives us a significant
315 * speedup.
316 *
317 * Args: hash - the gki structure (we need to know the hash table size)
318 * key - a string to calculate the hash value for
319 *
320 * Returns: a hash value, in the range 0..hash->nhash-1.
321 * hash table is unmodified.
322 */
323 static int
324 gki_hashvalue(GKI *hash, char *key)
325 {
326 int val = 0;
327
328 for (; *key != '\0'; key++)
329 {
330 val = GKI_ALPHABETSIZE*val + *key;
331 if (*(++key) == '\0') { val = val % hash->nhash; break; }
332 val = (GKI_ALPHABETSIZE*val + *key) % hash->nhash;
333 }
334 return val;
335 }
336
337 /* Function: gki_upsize()
338 * Date: SRE, Sat May 1 11:46:07 1999 [May Day geek-out]
339 *
340 * Purpose: Grow the hash table to the next available size.
341 *
342 * Args: old - the GKI hash table to reallocate.
343 *
344 * Returns: 1 on success (the hash table is changed);
345 * 0 on failure; the table is already at its maximum size,
346 * and the hash table is returned unchanged.
347 */
348 static int
349 gki_upsize(GKI *old)
350 {
351 GKI *new;
352 int i;
353 struct gki_elem *optr;
354 struct gki_elem *nptr;
355 int val;
356
357 if (old->primelevel >= GKI_NPRIMES-1) return 0;
358 new = gki_alloc(old->primelevel+1);
359
360 /* Read the old, store in the new, while *not changing*
361 * any key indices. Because of the way the lists are
362 * treated as LIFO stacks, all the lists are reversed
363 * in the new structure.
364 */
365 for (i = 0; i < old->nhash; i++)
366 {
367 optr = old->table[i];
368 while (optr != NULL)
369 {
370 val = gki_hashvalue(new, optr->key);
371
372 nptr = new->table[val];
373 new->table[val] = optr;
374 optr = optr->nxt;
375 new->table[val]->nxt = nptr;
376 }
377 }
378 free(old->table);
379
380 /* Now swap within the interior of the structures, so the old
381 * structure is updated to the new structure.
382 * (nkeys is identical, so we don't need to swap that element.)
383 */
384 old->primelevel = new->primelevel;
385 old->nhash = new->nhash;
386 old->table = new->table;
387 free(new);
388 return 1;
389 }