comparison clustalomega/clustal-omega-0.2.0/src/squid/ssi.c @ 0:ff1768533a07

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 17:04:25 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ff1768533a07
1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
4 *
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
7 * for details.
8 *****************************************************************/
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <sys/stat.h>
14 #include <sys/types.h>
15 #include <unistd.h>
16 #include "squid.h"
17 #include "ssi.h"
18 #ifdef CLUSTALO
19 #include <limits.h>
20 #endif
21
/* Magic numbers identifying an SSI index file. load_indexfile() reads
 * the first 32-bit word of the file and accepts either of these values;
 * anything else is rejected with SSI_ERR_BADMAGIC.
 */
static sqd_uint32 v20magic = 0xf3f3e9b1; /* SSI 1.0: "ssi1" + 0x80808080 */
static sqd_uint32 v20swap = 0xb1e9f3f3; /* byteswapped */
24
/* Forward declarations for the file-local (static) helpers used below. */

/* Fixed-width integer and offset I/O on an index stream. Callers in
 * this file test the read_* results with `!`, i.e. a zero return is
 * treated as failure.
 */
static int read_i16(FILE *fp, sqd_uint16 *ret_result);
static int read_i32(FILE *fp, sqd_uint32 *ret_result);
static int read_i64(FILE *fp, sqd_uint64 *ret_result);
static int read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset);
static int write_i16(FILE *fp, sqd_uint16 n);
static int write_i32(FILE *fp, sqd_uint32 n);
static int write_i64(FILE *fp, sqd_uint64 n);
static int write_offset(FILE *fp, SSIOFFSET *offset);

/* Key lookup and record positioning inside an open index. Callers in
 * this file treat a 0 return from either routine as success;
 * binary_search() is also expected to report SSI_ERR_NO_SUCH_KEY when
 * the key is absent.
 */
static int binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base,
                         sqd_uint32 recsize, sqd_uint32 maxidx);
static int indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len,
                              sqd_uint32 n);

/* SSIFILE / SSIINDEX housekeeping. */
static void clear_ssifile(SSIFILE *sfp);
static sqd_uint64 current_index_size(SSIINDEX *g);
static int activate_external_sort(SSIINDEX *g);
static int load_indexfile(SSIFILE *sfp);

/* NOTE(review): presumably these parse one line of the temporary key
 * files written in external-sort mode; their definitions are not in
 * view here -- confirm.
 */
static int parse_pkey_info(char *buf, char mode, struct ssipkey_s *pkey);
static int parse_skey_info(char *buf, struct ssiskey_s *skey);
43
44 /* Function: SSIOpen()
45 * Date: SRE, Sun Dec 31 12:40:03 2000 [St. Louis]
46 *
47 * Purpose: Opens the SSI index file {filename} and returns
48 * a SSIFILE * stream thru {ret_sfp}.
49 * The caller must eventually close this stream using
50 * SSIClose(). More than one index file can be open
51 * at once.
52 *
53 * Args: filename - full path to a SSI index file
54 *
55 * Returns: Returns 0 on success, nonzero on failure.
56 */
57 int
58 SSIOpen(char *filename, SSIFILE **ret_sfp)
59 {
60 SSIFILE *sfp = NULL;
61 int status;
62 if ((sfp = malloc(sizeof(SSIFILE))) == NULL) return SSI_ERR_MALLOC;
63 if ((sfp->fp = fopen(filename, "rb")) == NULL) {
64 free(sfp);
65 return SSI_ERR_NOFILE;
66 }
67 status = load_indexfile(sfp);
68 *ret_sfp = sfp;
69 return status;
70 }
71 /* load_indexfile(): given a SSIFILE structure with an open and positioned
72 * stream (fp) -- but no other data loaded -- read the next SSIFILE
73 * in from disk. We use this routine without its SSIOpen() wrapper
74 * as part of the external mergesort when creating large indices.
75 */
76 static int
77 load_indexfile(SSIFILE *sfp)
78 {
79 sqd_uint32 magic;
80 sqd_uint16 i; /* counter over files */
81 int status; /* overall return status if an error is thrown */
82
83 status = SSI_ERR_BADFORMAT; /* default: almost every kind of error is a bad format error */
84
85 sfp->filename = NULL;
86 sfp->fileformat = NULL;
87 sfp->fileflags = NULL;
88 sfp->bpl = NULL;
89 sfp->rpl = NULL;
90 sfp->nfiles = 0;
91 if (! read_i32(sfp->fp, &magic)) {status = SSI_ERR_BADMAGIC; goto FAILURE; }
92 if (magic != v20magic && magic != v20swap) {status = SSI_ERR_BADMAGIC; goto FAILURE; }
93 if (! read_i32(sfp->fp, &(sfp->flags))) goto FAILURE;
94
95 /* If we have 64-bit offsets, make sure we can deal with them.
96 */
97 #ifndef HAS_64BIT_FILE_OFFSETS
98 if ((sfp->flags & SSI_USE64_INDEX) ||
99 (sfp->flags & SSI_USE64))
100 { status = SSI_ERR_NO64BIT; goto FAILURE; }
101 #endif
102
103 sfp->imode = (sfp->flags & SSI_USE64_INDEX) ? SSI_OFFSET_I64 : SSI_OFFSET_I32;
104 sfp->smode = (sfp->flags & SSI_USE64) ? SSI_OFFSET_I64 : SSI_OFFSET_I32;
105
106 if (! read_i16(sfp->fp, &(sfp->nfiles))) goto FAILURE;
107 if (! read_i32(sfp->fp, &(sfp->nprimary))) goto FAILURE;
108 if (! read_i32(sfp->fp, &(sfp->nsecondary))) goto FAILURE;
109 if (! read_i32(sfp->fp, &(sfp->flen))) goto FAILURE;
110 if (! read_i32(sfp->fp, &(sfp->plen))) goto FAILURE;
111 if (! read_i32(sfp->fp, &(sfp->slen))) goto FAILURE;
112 if (! read_i32(sfp->fp, &(sfp->frecsize))) goto FAILURE;
113 if (! read_i32(sfp->fp, &(sfp->precsize))) goto FAILURE;
114 if (! read_i32(sfp->fp, &(sfp->srecsize))) goto FAILURE;
115
116 if (! read_offset(sfp->fp, sfp->imode, &(sfp->foffset))) goto FAILURE;
117 if (! read_offset(sfp->fp, sfp->imode, &(sfp->poffset))) goto FAILURE;
118 if (! read_offset(sfp->fp, sfp->imode, &(sfp->soffset))) goto FAILURE;
119
120 /* Read the file information and keep it.
121 * We expect the number of files to be small, so reading it
122 * once should be advantageous overall. If SSI ever had to
123 * deal with large numbers of files, you'd probably want to
124 * read file information on demand.
125 */
126 if (sfp->nfiles == 0) goto FAILURE;
127 if ((sfp->filename=malloc(sizeof(char *) *sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
128 for (i = 0; i < sfp->nfiles; i++) sfp->filename[i] = NULL;
129 if ((sfp->fileformat=malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
130 if ((sfp->fileflags =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
131 if ((sfp->bpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
132 if ((sfp->rpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
133
134 for (i = 0; i < sfp->nfiles; i++)
135 {
136 /* We have to explicitly position, because header and file
137 * records may expand in the future; frecsize and foffset
138 * give us forwards compatibility.
139 */
140 if (indexfile_position(sfp, &(sfp->foffset), sfp->frecsize, i) !=0) goto FAILURE;
141 if ((sfp->filename[i] =malloc(sizeof(char)*sfp->flen)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; }
142 if (fread(sfp->filename[i],sizeof(char),sfp->flen, sfp->fp)!=sfp->flen) goto FAILURE;
143 if (! read_i32(sfp->fp, &(sfp->fileformat[i]))) goto FAILURE;
144 if (! read_i32(sfp->fp, &(sfp->fileflags[i]))) goto FAILURE;
145 if (! read_i32(sfp->fp, &(sfp->bpl[i]))) goto FAILURE;
146 if (! read_i32(sfp->fp, &(sfp->rpl[i]))) goto FAILURE;
147 }
148
149 /* Success. Return 0.
150 */
151 return 0;
152
153 FAILURE:
154 /* Failure: free the damaged structure, return status code.
155 */
156 SSIClose(sfp);
157 return status;
158 }
159
160
161
162 /* Function: SSIGetOffsetByName()
163 * Date: SRE, Sun Dec 31 13:55:31 2000 [St. Louis]
164 *
165 * Purpose: Looks up the string {key} in the open index {sfp}.
166 * {key} can be either a primary or secondary key. If {key}
167 * is found, {*ret_fh} contains a unique handle on
168 * the file that contains {key} (suitable for an SSIFileInfo()
169 * call, or for comparison to the handle of the last file
170 * that was opened for retrieval), and {offset} is filled
171 * in with the offset in that file.
172 *
173 * Args: sfp - open index file
174 * key - string to search for
175 * ret_fh - RETURN: handle on file that key is in
176 * ret_offset - RETURN: offset of the start of that key's record
177 *
178 * Returns: 0 on success.
179 * non-zero on error.
180 */
181 int
182 SSIGetOffsetByName(SSIFILE *sfp, char *key, int *ret_fh,
183 SSIOFFSET *ret_offset)
184 {
185 int status;
186 sqd_uint16 fnum;
187
188 /* Look in the primary keys.
189 */
190 status = binary_search(sfp, key, sfp->plen, &(sfp->poffset), sfp->precsize,
191 sfp->nprimary);
192 if (status == 0) {
193 /* We found it as a primary key; get our data & return.
194 */
195 if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA;
196 *ret_fh = (int) fnum;
197 if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA;
198
199 return 0; /* success! (we don't need the other key data) */
200 } else if (status == SSI_ERR_NO_SUCH_KEY) {
201 /* Not in the primary keys? OK, try the secondary keys.
202 */
203 if (sfp->nsecondary > 0) {
204 char *pkey;
205 status = binary_search(sfp, key, sfp->slen, &(sfp->soffset), sfp->srecsize,
206 sfp->nsecondary);
207 if (status != 0) return status;
208 if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC;
209 if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA;
210
211 status = SSIGetOffsetByName(sfp, pkey, ret_fh, ret_offset);
212 free(pkey);
213 }
214 return status;
215
216 } else return status;
217 /*NOTREACHED*/
218 }
219
220 /* Function: SSIGetOffsetByNumber()
221 * Date: SRE, Mon Jan 1 19:42:42 2001 [St. Louis]
222 *
223 * Purpose: Looks up primary key #{n} in the open index {sfp}.
224 * {n} ranges from 0..nprimary-1. When key #{n}
225 * is found, {*ret_fh} contains a unique
226 * handle on the file that contains {key} (suitable
227 * for an SSIFileInfo() call, or for comparison to
228 * the handle of the last file that was opened for retrieval),
229 * and {offset} is filled in with the offset in that file.
230 *
231 * Args: sfp - open index file
232 * n - primary key number to retrieve.
233 * ret_fh - RETURN: handle on file that key is in
234 * ret_offset - RETURN: offset of the start of that key's record
235 *
236 * Returns: 0 on success.
237 * non-zero on error.
238 */
239 int
240 SSIGetOffsetByNumber(SSIFILE *sfp, int n, int *ret_fh, SSIOFFSET *ret_offset)
241 {
242 sqd_uint16 fnum;
243 char *pkey;
244
245 if (n >= sfp->nprimary) return SSI_ERR_NO_SUCH_KEY;
246 if (indexfile_position(sfp, &(sfp->poffset), sfp->precsize, n) != 0)
247 return SSI_ERR_SEEK_FAILED;
248
249 if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC;
250 if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA;
251 if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA;
252 if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA;
253 *ret_fh = fnum;
254 free(pkey);
255 return 0;
256 }
257
258 /* Function: SSIGetSubseqOffset()
259 * Date: SRE, Mon Jan 1 19:49:31 2001 [St. Louis]
260 *
261 * Purpose: Implements SSI_FAST_SUBSEQ.
262 *
263 * Looks up a primary or secondary {key} in the open
264 * index {sfp}. Asks for the nearest offset to a
265 * subsequence starting at position {requested_start}
266 * in the sequence (numbering the sequence 1..L).
267 * If {key} is found, on return, {ret_fh}
268 * contains a unique handle on the file that contains
269 * {key} (suitable for an SSIFileInfo() call, or for
270 * comparison to the handle of the last file that was
271 * opened for retrieval); {record_offset} contains the
272 * disk offset to the start of the record; {data_offset}
273 * contains the disk offset either exactly at the requested
274 * residue, or at the start of the line containing the
275 * requested residue; {ret_actual_start} contains the
276 * coordinate (1..L) of the first valid residue at or
277 * after {data_offset}. {ret_actual_start} is <=
278 * {requested_start}.
279 *
280 * Args: sfp - open index file
281 * key - primary or secondary key to find
282 * requested_start - residue we'd like to start at (1..L)
283 * ret_fh - RETURN: handle for file the key is in
284 * record_offset - RETURN: offset of entire record
285 * data_offset - RETURN: offset of subseq (see above)
286 * ret_actual_start- RETURN: coord (1..L) of residue at data_offset
287 *
288 * Returns: 0 on success, non-zero on failure.
289 */
290 int
291 SSIGetSubseqOffset(SSIFILE *sfp, char *key, int requested_start,
292 int *ret_fh, SSIOFFSET *record_offset,
293 SSIOFFSET *data_offset, int *ret_actual_start)
294 {
295 int status;
296 sqd_uint32 len;
297 int r, b, i, l; /* tmp variables for "clarity", to match docs */
298
299 /* Look up the key. Rely on the fact that SSIGetOffsetByName()
300 * leaves the index file positioned at the rest of the data for this key.
301 */
302 status = SSIGetOffsetByName(sfp, key, ret_fh, record_offset);
303 if (status != 0) return status;
304
305 /* Check that we're allowed to do subseq lookup on that file.
306 */
307 if (! (sfp->fileflags[*ret_fh] & SSI_FAST_SUBSEQ))
308 return SSI_ERR_NO_SUBSEQS;
309
310 /* Read the data we need for subseq lookup
311 */
312 if (! read_offset(sfp->fp, sfp->smode, data_offset)) return SSI_ERR_NODATA;
313 if (! read_i32(sfp->fp, &len)) return SSI_ERR_NODATA;
314
315 /* Set up tmp variables for clarity of equations below,
316 * and to make them match documentation (ssi-format.tex).
317 */
318 r = sfp->rpl[*ret_fh]; /* residues per line */
319 b = sfp->bpl[*ret_fh]; /* bytes per line */
320 i = requested_start; /* start position 1..L */
321 l = (i-1)/r; /* data line # (0..) that the residue is on */
322 if (r == 0 || b == 0) return SSI_ERR_NO_SUBSEQS;
323 if (i < 0 || i > len) return SSI_ERR_RANGE;
324
325 /* When b = r+1, there's nothing but sequence on each data line (and the \0),
326 * and we can find each residue precisely.
327 */
328 if (b == r+1) {
329 if (sfp->smode == SSI_OFFSET_I32) {
330 data_offset->mode = SSI_OFFSET_I32;
331 data_offset->off.i32 = data_offset->off.i32 + l*b + (i-1)%r;
332 } else if (sfp->smode == SSI_OFFSET_I64) {
333 data_offset->mode = SSI_OFFSET_I64;
334 data_offset->off.i64 = data_offset->off.i64 + l*b + (i-1)%r;
335 }
336 *ret_actual_start = requested_start;
337 } else {
338 /* else, there's other stuff on seq lines, so the best
339 * we can do easily is to position at start of relevant line.
340 */
341 if (sfp->smode == SSI_OFFSET_I32) {
342 data_offset->mode = SSI_OFFSET_I32;
343 data_offset->off.i32 = data_offset->off.i32 + l*b;
344 } else if (sfp->smode == SSI_OFFSET_I64) {
345 data_offset->mode = SSI_OFFSET_I64;
346 data_offset->off.i64 = data_offset->off.i64 + l*b;
347 }
348 /* yes, the eq below is = 1 + (i-1)/r*r but it's not = i. that's an integer /. */
349 *ret_actual_start = 1 + l*r;
350 }
351 return 0;
352 }
353
354 /* Function: SSISetFilePosition()
355 * Date: SRE, Tue Jan 2 09:13:46 2001 [St. Louis]
356 *
357 * Purpose: Uses {offset} to sets the file position for {fp}, usually an
358 * open sequence file, relative to the start of the file.
359 * Hides the details of system-dependent shenanigans necessary for
360 * file positioning in large (>2 GB) files.
361 *
362 * Behaves just like fseek(fp, offset, SEEK_SET) for 32 bit
363 * offsets and <2 GB files.
364 *
365 * Warning: if all else fails, in desperation, it will try to
366 * use fsetpos(). This requires making assumptions about fpos_t
367 * that may be unwarranted... assumptions that ANSI C prohibits
368 * me from making... though I believe the ./configure
369 * script robustly tests whether I can play with fpos_t like this.
370 *
371 * Args: fp - file to position.
372 * offset - SSI offset relative to file start.
373 *
374 * Returns: 0 on success, nonzero on error.
375 */
376 int
377 SSISetFilePosition(FILE *fp, SSIOFFSET *offset)
378 {
379 if (offset->mode == SSI_OFFSET_I32) {
380 if (fseek(fp, offset->off.i32, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED;
381 }
382 #ifndef HAS_64BIT_FILE_OFFSETS
383 else return SSI_ERR_NO64BIT;
384 #elif defined HAVE_FSEEKO && SIZEOF_OFF_T == 8
385 else if (fseeko(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED;
386 #elif defined HAVE_FSEEKO64 && SIZEOF_OFF64_T == 8
387 else if (fseeko64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED;
388 #elif defined HAVE_FSEEK64
389 else if (fseek64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED;
390 #elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8
391 else if (fsetpos(fp, &(offset->off.i64)) != 0) return SSI_ERR_SEEK_FAILED;
392 #endif
393 return 0;
394 }
395
396
397 /* Function: SSIFileInfo()
398 * Date: SRE, Tue Jan 2 10:31:01 2001 [St. Louis]
399 *
400 * Purpose: Given a file number {fh} in an open index file
401 * {sfp}, retrieve file name {ret_filename} and
402 * the file format {ret_format}.
403 *
404 * {ret_filename} is a pointer to a string maintained
405 * internally by {sfp}. It should not be free'd;
406 * SSIClose(sfp) takes care of it.
407 *
408 * Args: sfp - open index file
409 * fh - handle on file to look up
410 * ret_filename - RETURN: name of file n
411 * ret_format - RETURN: format of file n
412 *
413 * Returns: 0 on success, nonzero on failure.
414 */
415 int
416 SSIFileInfo(SSIFILE *sfp, int fh, char **ret_filename, int *ret_format)
417 {
418 if (fh < 0 || fh >= sfp->nfiles) return SSI_ERR_BADARG;
419 *ret_filename = sfp->filename[fh];
420 *ret_format = sfp->fileformat[fh];
421 return 0;
422 }
423
424 /* Function: SSIClose()
425 * Date: SRE, Sun Dec 31 14:56:37 2000 [St. Louis]
426 *
427 * Purpose: Close an open {SSIFILE *}.
428 *
429 * Args: sfp - index file to close.
430 *
431 * Returns: (void)
432 */
433 void
434 SSIClose(SSIFILE *sfp)
435 {
436 if (sfp != NULL) {
437 clear_ssifile(sfp);
438 if (sfp->fp != NULL) fclose(sfp->fp);
439 free(sfp);
440 }
441 }
442 /* clear_ssifile(): free the innards of SSIFILE, without
443 * destroying the structure or closing the stream.
444 */
445 static void
446 clear_ssifile(SSIFILE *sfp)
447 {
448 int i;
449
450 if (sfp->filename != NULL) {
451 for (i = 0; i < sfp->nfiles; i++)
452 if (sfp->filename[i] != NULL) free(sfp->filename[i]);
453 free(sfp->filename);
454 }
455 if (sfp->fileformat != NULL) free(sfp->fileformat);
456 if (sfp->fileflags != NULL) free(sfp->fileflags);
457 if (sfp->bpl != NULL) free(sfp->bpl);
458 if (sfp->rpl != NULL) free(sfp->rpl);
459 }
460
461
462 /* Function: SSIRecommendMode()
463 * Date: SRE, Fri Feb 16 08:23:47 2001 [St. Louis]
464 *
465 * Purpose: Examines the file and determines whether it should be
466 * indexed with large file support or not; returns
467 * SSI_OFFSET_I32 for most files, SSI_OFFSET_I64 for large
468 * files, or -1 on failure.
469 *
470 * Args: file - name of file to check for size
471 *
472 * Returns: -1 on failure (including case where file is too big)
473 * SSI_OFFSET_I32 for most files (<= 2^31-1 bytes)
474 * SSI_OFFSET_I64 for large files (> 2^31-1 bytes)
475 */
476 int
477 SSIRecommendMode(char *file)
478 {
479 #if HAVE_STAT64
480 struct stat64 s1;
481 if (stat64(file, &s1) == 0) {
482 if (s1.st_size <= 2146483647L) return SSI_OFFSET_I32;
483 else return SSI_OFFSET_I64;
484 }
485 #else
486 struct stat s2;
487 if (stat(file, &s2) == 0) {
488 if (s2.st_size <= 2146483647L) return SSI_OFFSET_I32;
489 else return SSI_OFFSET_I64;
490 }
491 #endif
492 return -1;
493 }
494
495
496 /* Function: SSICreateIndex()
497 * Date: SRE, Tue Jan 2 11:23:25 2001 [St. Louis]
498 *
499 * Purpose: Creates and initializes a SSI index structure.
500 * Sequence file offset type is specified by {mode}.
501 *
502 * Args: mode - SSI_OFFSET_I32 or SSI_OFFSET_I64, sequence file index mode.
503 *
504 * Returns: ptr to new index structure, or NULL on failure.
505 * Caller is responsible for free'ing the returned
506 * structure with SSIFreeIndex().
507 */
508 SSIINDEX *
509 SSICreateIndex(int mode)
510 {
511 SSIINDEX *g;
512
513 g = NULL;
514 if ((g = malloc(sizeof(SSIINDEX))) == NULL) goto FAILURE;
515 g->smode = mode;
516 g->imode = SSI_OFFSET_I32; /* index always starts as 32-bit; may get upgraded later */
517 g->external = FALSE;
518 g->max_ram = SSI_MAXRAM;
519
520 #ifndef HAS_64BIT_FILE_OFFSETS
521 if (mode == SSI_OFFSET_I64)
522 Die("\
523 Can't create a 64-bit SSI index on this system, sorry;\n\
524 I don't have 64-bit file offset functions available.\n");
525 #endif
526
527 g->filenames = NULL;
528 g->fileformat = NULL;
529 g->bpl = NULL;
530 g->rpl = NULL;
531 g->flen = 0;
532 g->nfiles = 0;
533
534 g->pkeys = NULL;
535 g->plen = 0;
536 g->nprimary = 0;
537 g->ptmpfile = "tmp.ssi.1"; /* hardcoded, for now. */
538 g->ptmp = NULL;
539
540 g->skeys = NULL;
541 g->slen = 0;
542 g->nsecondary = 0;
543 g->stmpfile = "tmp.ssi.2"; /* hardcoded, for now. */
544 g->stmp = NULL;
545
546 /* All mallocs must go after NULL initializations, because of the cleanup strategy;
547 * we'll try to free anything non-NULL if a malloc fails.
548 */
549 if ((g->filenames = malloc(sizeof(char *) * SSI_FILE_BLOCK)) == NULL) goto FAILURE;
550 if ((g->fileformat= malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE;
551 if ((g->bpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE;
552 if ((g->rpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE;
553
554 if ((g->pkeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE;
555 if ((g->skeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE;
556
557 return g;
558
559 FAILURE:
560 SSIFreeIndex(g); /* free the damaged structure */
561 return NULL;
562 }
563
564 /* Function: SSIGetFilePosition()
565 * Date: SRE, Tue Jan 2 09:59:26 2001 [St. Louis]
566 *
567 * Purpose: Fills {ret_offset} with the current disk
568 * offset of {fp}, relative to the start of the file.
569 * {mode} is set to either SSI_OFFSET_I32 or
570 * SSI_OFFSET_I64. If {mode} is _I32 (32 bit), just wraps
571 * a call to ftell(); otherwise deals with system-dependent
572 * details of 64-bit file offsets.
573 *
574 * Args: fp - open stream
575 * mode - SSI_OFFSET_I32 or SSI_OFFSET_I64
576 * ret_offset - RETURN: file position
577 *
578 * Returns: 0 on success. nonzero on error.
579 */
580 int
581 SSIGetFilePosition(FILE *fp, int mode, SSIOFFSET *ret_offset)
582 {
583 if (mode == SSI_OFFSET_I32)
584 {
585 ret_offset->mode = SSI_OFFSET_I32;
586 ret_offset->off.i32 = ftell(fp);
587 if (ret_offset->off.i32 == -1) return SSI_ERR_TELL_FAILED;
588 }
589 else if (mode != SSI_OFFSET_I64) abort(); /* only happens on a coding error */
590 else {
591 ret_offset->mode = SSI_OFFSET_I64;
592 #ifndef HAS_64BIT_FILE_OFFSETS
593 return SSI_ERR_NO64BIT;
594 #elif defined HAVE_FTELLO && SIZEOF_OFF_T == 8
595 if ((ret_offset->off.i64 = ftello(fp)) == -1) return SSI_ERR_TELL_FAILED;
596 #elif defined HAVE_FTELLO64 && SIZEOF_OFF64_T == 8
597 if ((ret_offset->off.i64 = ftello64(fp)) == -1) return SSI_ERR_TELL_FAILED;
598 #elif defined HAVE_FTELL64
599 if ((ret_offset->off.i64 = ftell64(fp)) == -1) return SSI_ERR_TELL_FAILED;
600 #elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8
601 if (fgetpos(fp, &(ret_offset->off.i64)) != 0) return SSI_ERR_TELL_FAILED;
602 #endif
603 }
604 return 0;
605 }
606
607 /* Function: SSIAddFileToIndex()
608 * Date: SRE, Tue Jan 2 12:54:36 2001 [St. Louis]
609 *
610 * Purpose: Adds the sequence file {filename}, which is known to
611 * be in format {fmt}, to the index {g}. Creates and returns
612 * a unique filehandle {fh} for then associating primary keys
613 * with this file using SSIAddPrimaryKeyToIndex().
614 *
615 * Args: g - active index
616 * filename - file to add
617 * fmt - format code for this file (e.g. SQFILE_FASTA)
618 * ret_fh - RETURN: unique handle for this file
619 *
620 * Returns: 0 on success; nonzero on error.
621 */
622 int
623 SSIAddFileToIndex(SSIINDEX *g, char *filename, int fmt, int *ret_fh)
624 {
625 int n;
626
627 if (g->nfiles >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES;
628
629 n = strlen(filename);
630 if ((n+1) > g->flen) g->flen = n+1;
631
632 g->filenames[g->nfiles] = FileTail(filename, FALSE);
633 g->fileformat[g->nfiles] = fmt;
634 g->bpl[g->nfiles] = 0;
635 g->rpl[g->nfiles] = 0;
636 *ret_fh = g->nfiles; /* handle is simply = file number */
637 g->nfiles++;
638
639 if (g->nfiles % SSI_FILE_BLOCK == 0) {
640 g->filenames = realloc(g->filenames, sizeof(char *) * (g->nfiles+SSI_FILE_BLOCK));
641 if (g->filenames == NULL) return SSI_ERR_MALLOC;
642 g->fileformat= realloc(g->fileformat, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK));
643 if (g->fileformat == NULL) return SSI_ERR_MALLOC;
644 g->bpl = realloc(g->bpl, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK));
645 if (g->bpl == NULL) return SSI_ERR_MALLOC;
646 g->rpl = realloc(g->rpl, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK));
647 if (g->rpl == NULL) return SSI_ERR_MALLOC;
648 }
649 return 0;
650 }
651
652
653 /* Function: SSISetFileForSubseq()
654 * Date: SRE, Tue Jan 9 10:02:05 2001 [St. Louis]
655 *
656 * Purpose: Set SSI_FAST_SUBSEQ for the file indicated by
657 * filehandle {fh} in the index {g}, setting
658 * parameters {bpl} and {rpl} to the values given.
659 * {bpl} is the number of bytes per sequence data line.
660 * {rpl} is the number of residues per sequence data line.
661 * Caller must be sure that {bpl} and {rpl} do not change
662 * on any line of any sequence record in the file
663 * (except for the last data line of each record). If
664 * this is not the case in this file, SSI_FAST_SUBSEQ
665 * will not work, and this routine should not be
666 * called.
667 *
668 * Args: g - the active index
669 * fh - handle for file to set SSI_FAST_SUBSEQ on
670 * bpl - bytes per data line
671 * rpl - residues per data line
672 *
673 * Returns: 0 on success; 1 on error.
674 */
675 int
676 SSISetFileForSubseq(SSIINDEX *g, int fh, int bpl, int rpl)
677 {
678 if (fh < 0 || fh >= g->nfiles) return SSI_ERR_BADARG;
679 if (bpl <= 0 || rpl <= 0) return SSI_ERR_BADARG;
680 g->bpl[fh] = bpl;
681 g->rpl[fh] = rpl;
682 return 0;
683 }
684
685
686 /* Function: SSIAddPrimaryKeyToIndex()
687 * Date: SRE, Tue Jan 2 11:50:54 2001 [St. Louis]
688 *
689 * Purpose: Put primary key {key} in the index {g}, while telling
690 * the index this primary key is in the file associated
691 * with filehandle {fh} (returned by a previous call
692 * to SSIAddFileToIndex()), and its record starts at
693 * position {r_off} in the file.
694 *
695 * {d_off} and {L} are optional; they may be left unset
696 * by passing NULL and 0, respectively. (If one is
697 * provided, both must be provided.) If they are provided,
698 * {d_off} gives the position of the first line of sequence
699 * data in the record, and {L} gives the length of
700 * the sequence in residues. They are used when
701 * SSI_FAST_SUBSEQ is set for this file. If SSI_FAST_SUBSEQ
702 * is not set for the file, {d_off} and {L} will be
703 * ignored by the index reading API even if they are stored
704 * by the index writing API, so it doesn't hurt for the
705 * indexing program to provide them; typically they
706 * won't know whether it's safe to set SSI_FAST_SUBSEQ
707 * for the whole file until the whole file has been
708 * read and every key has already been added to the index.
709 *
710 * Args: g - active index
711 * key - primary key to add
712 * fh - handle on file that this key's in
713 * r_off - offset to start of record
714 * d_off - offset to start of sequence data
715 * L - length of sequence, or 0
716 *
717 * Returns: 0 on success, nonzero on error.
718 */
719 int
720 SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int fh,
721 SSIOFFSET *r_off, SSIOFFSET *d_off, int L)
722 {
723 int n; /* a string length */
724
725 if (fh >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES;
726 if (g->nprimary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS;
727 if (L > 0 && d_off == NULL) abort(); /* need both. */
728
729 /* Before adding the key: check how big our index is.
730 * If it's getting too large, switch to external mode.
731 */
732 if (!g->external && current_index_size(g) >= g->max_ram)
733 if (activate_external_sort(g) != 0) return SSI_ERR_NOFILE;
734
735 /* Update maximum pkey length, if needed.
736 */
737 n = strlen(key);
738 if ((n+1) > g->plen) g->plen = n+1;
739
740 /* External mode? Simply append to disk...
741 */
742 if (g->external) {
743 if (g->smode == SSI_OFFSET_I32) {
744 fprintf(g->ptmp, "%s\t%d\t%lu\t%lu\t%lu\n",
745 key, fh, (unsigned long) r_off->off.i32,
746 (unsigned long) (d_off == NULL? 0 : d_off->off.i32),
747 (unsigned long) L);
748 } else {
749 #ifdef CLUSTALO
750 fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n",
751 key, fh, (unsigned long long)r_off->off.i64,
752 d_off == NULL? 0 : (unsigned long long) d_off->off.i64,
753 (unsigned long) L);
754 #else
755 fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n",
756 key, fh, r_off->off.i64,
757 d_off == NULL? 0 : d_off->off.i64,
758 (unsigned long) L);
759 #endif
760 }
761 g->nprimary++;
762 return 0;
763 }
764
765 /* Else: internal mode, keep keys in memory...
766 */
767 if ((g->pkeys[g->nprimary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC;
768 g->pkeys[g->nprimary].fnum = (sqd_uint16) fh;
769 g->pkeys[g->nprimary].r_off = *r_off;
770 if (d_off != NULL && L > 0) {
771 g->pkeys[g->nprimary].d_off = *d_off;
772 g->pkeys[g->nprimary].len = L;
773 } else {
774 /* yeah, this looks stupid, but look: we have to give a valid
775 looking, non-NULL d_off of some sort, or writes will fail.
776 It's going to be unused anyway. */
777 g->pkeys[g->nprimary].d_off = *r_off;
778 g->pkeys[g->nprimary].len = 0;
779 }
780 g->nprimary++;
781
782 if (g->nprimary % SSI_KEY_BLOCK == 0) {
783 g->pkeys = realloc(g->pkeys, sizeof(struct ssipkey_s) * (g->nprimary+SSI_KEY_BLOCK));
784 if (g->pkeys == NULL) return SSI_ERR_MALLOC;
785 }
786 return 0;
787 }
788
789
790 /* Function: SSIAddSecondaryKeyToIndex()
791 * Date: SRE, Tue Jan 2 12:44:40 2001 [St. Louis]
792 *
793 * Purpose: Puts secondary key {key} in the index {g}, associating
794 * it with primary key {pkey} that was previously
795 * registered by SSIAddPrimaryKeyToIndex().
796 *
797 * Args: g - active index
798 * key - secondary key to add
799 * pkey - primary key to associate this key with
800 *
801 * Returns: 0 on success, nonzero on failure.
802 */
803 int
804 SSIAddSecondaryKeyToIndex(SSIINDEX *g, char *key, char *pkey)
805 {
806 int n; /* a string length */
807
808 if (g->nsecondary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS;
809
810 /* Before adding the key: check how big our index is.
811 * If it's getting too large, switch to external mode.
812 */
813 if (!g->external && current_index_size(g) >= g->max_ram)
814 if (activate_external_sort(g) != 0) return SSI_ERR_NOFILE;
815
816 /* Update maximum secondary key length, if necessary.
817 */
818 n = strlen(key);
819 if ((n+1) > g->slen) g->slen = n+1;
820
821 /* if external mode: write info to disk.
822 */
823 if (g->external) {
824 fprintf(g->stmp, "%s\t%s\n", key, pkey);
825 g->nsecondary++;
826 return 0;
827 }
828
829 /* else, internal mode... store info in memory.
830 */
831 if ((g->skeys[g->nsecondary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC;
832 if ((g->skeys[g->nsecondary].pkey = sre_strdup(pkey, -1)) == NULL) return SSI_ERR_MALLOC;
833 g->nsecondary++;
834
835 if (g->nsecondary % SSI_KEY_BLOCK == 0) {
836 g->skeys = realloc(g->skeys, sizeof(struct ssiskey_s) * (g->nsecondary+SSI_KEY_BLOCK));
837 if (g->skeys == NULL) return SSI_ERR_MALLOC;
838 }
839 return 0;
840 }
841
842
843
844
845 /* Function: SSIWriteIndex()
846 * Date: SRE, Tue Jan 2 13:55:56 2001 [St. Louis]
847 *
848 * Purpose: Writes complete index {g} in SSI format to a
849 * binary file {file}. Does all
850 * the overhead of sorting the primary and secondary keys,
851 * and maintaining the association of secondary keys
852 * with primary keys during and after the sort.
853 *
854 * Args: file - file to write to
855 * g - index to sort & write out.
856 *
857 * Returns: 0 on success, nonzero on error.
858 */
859 /* needed for qsort() */
860 static int
861 pkeysort(const void *k1, const void *k2)
862 {
863 struct ssipkey_s *key1;
864 struct ssipkey_s *key2;
865 key1 = (struct ssipkey_s *) k1;
866 key2 = (struct ssipkey_s *) k2;
867 return strcmp(key1->key, key2->key);
868 }
869 static int
870 skeysort(const void *k1, const void *k2)
871 {
872 struct ssiskey_s *key1;
873 struct ssiskey_s *key2;
874 key1 = (struct ssiskey_s *) k1;
875 key2 = (struct ssiskey_s *) k2;
876 return strcmp(key1->key, key2->key);
877 }
878 int
879 SSIWriteIndex(char *file, SSIINDEX *g)
880 {
881 FILE *fp;
882 int status;
883 int i;
884 sqd_uint32 header_flags, file_flags;
885 sqd_uint32 frecsize, precsize, srecsize;
886 sqd_uint64 foffset, poffset, soffset;
887 char *s, *s2;
888
889 if ((fp = fopen(file,"wb")) == NULL) return SSI_ERR_NOFILE;
890 status = 0;
891
892 /* How big is the index? If it's going to be > 2GB, we need
893 * to flip to 64-bit index mode. 2047 (instead of 2048) gives us
894 * some slop room.
895 * die'ing here is pretty brutal - if we flip to 64-bit index
896 * mode, we hve 100's of millions of keys, so we've processed
897 * a long time before reaching this point. Ah well.
898 */
899 if (current_index_size(g) >= 2047) {
900 g->imode = SSI_OFFSET_I64;
901 #ifndef HAS_64BIT_FILE_OFFSETS
902 Die("\
903 Can't switch to 64-bit SSI index mode on this system, sorry;\n\
904 I don't have 64-bit file offset functions available.\n");
905 #endif
906 }
907
908 /* Magic-looking numbers come from adding up sizes
909 * of things in bytes
910 */
911 frecsize = 16 + g->flen;
912 precsize = (g->smode == SSI_OFFSET_I64) ? 22+g->plen : 14+g->plen;
913 srecsize = g->slen + g->plen;
914
915 header_flags = 0;
916 if (g->smode == SSI_OFFSET_I64) header_flags |= SSI_USE64;
917 if (g->imode == SSI_OFFSET_I64) header_flags |= SSI_USE64_INDEX;
918
919 /* Magic-looking numbers again come from adding up sizes
920 * of things in bytes
921 */
922 foffset = (header_flags & SSI_USE64_INDEX) ? 66 : 54;
923 poffset = foffset + frecsize*g->nfiles;
924 soffset = poffset + precsize*g->nprimary;
925
926 /* Sort the keys
927 * If external mode, make system calls to UNIX/POSIX "sort" in place, then
928 * open new sorted files for reading thru ptmp and stmp handles.
929 * If internal mode, call qsort.
930 *
931 * Note that you'd better force a POSIX locale for the sort; else,
932 * some silly distro (e.g. Mandrake Linux >=8.1) may have specified
933 * LC_COLLATE=en_US, and this'll give a sort "bug" in which it doesn't
934 * sort by byte order.
935 */
936 if (g->external) {
937 char cmd[1024];
938
939 fclose(g->ptmp);
940 g->ptmp = NULL;
941 sprintf(cmd, "env LC_ALL=POSIX sort -o %s %s\n", g->ptmpfile, g->ptmpfile);
942 if ((status = system(cmd)) != 0) return SSI_ERR_EXTERNAL_SORT;
943 if ((g->ptmp = fopen(g->ptmpfile, "r")) == NULL) return SSI_ERR_EXTERNAL_SORT;
944
945 fclose(g->stmp);
946 g->stmp = NULL;
947 sprintf(cmd, "env LC_ALL=POSIX sort -o %s %s\n", g->stmpfile, g->stmpfile);
948 if ((status = system(cmd)) != 0) return SSI_ERR_EXTERNAL_SORT;
949 if ((g->stmp = fopen(g->stmpfile, "r")) == NULL) return SSI_ERR_EXTERNAL_SORT;
950 } else {
951 qsort((void *) g->pkeys, g->nprimary, sizeof(struct ssipkey_s), pkeysort);
952 qsort((void *) g->skeys, g->nsecondary, sizeof(struct ssiskey_s), skeysort);
953 }
954
955 /* Write the header
956 */
957 if (! write_i32(fp, v20magic)) return SSI_ERR_FWRITE;
958 if (! write_i32(fp, header_flags)) return SSI_ERR_FWRITE;
959 if (! write_i16(fp, g->nfiles)) return SSI_ERR_FWRITE;
960 if (! write_i32(fp, g->nprimary)) return SSI_ERR_FWRITE;
961 if (! write_i32(fp, g->nsecondary)) return SSI_ERR_FWRITE;
962 if (! write_i32(fp, g->flen)) return SSI_ERR_FWRITE;
963 if (! write_i32(fp, g->plen)) return SSI_ERR_FWRITE;
964 if (! write_i32(fp, g->slen)) return SSI_ERR_FWRITE;
965 if (! write_i32(fp, frecsize)) return SSI_ERR_FWRITE;
966 if (! write_i32(fp, precsize)) return SSI_ERR_FWRITE;
967 if (! write_i32(fp, srecsize)) return SSI_ERR_FWRITE;
968 if (g->imode == SSI_OFFSET_I32) {
969 if (! write_i32(fp, foffset)) return SSI_ERR_FWRITE;
970 if (! write_i32(fp, poffset)) return SSI_ERR_FWRITE;
971 if (! write_i32(fp, soffset)) return SSI_ERR_FWRITE;
972 } else {
973 if (! write_i64(fp, foffset)) return SSI_ERR_FWRITE;
974 if (! write_i64(fp, poffset)) return SSI_ERR_FWRITE;
975 if (! write_i64(fp, soffset)) return SSI_ERR_FWRITE;
976 }
977
978 /* The file section
979 */
980 if ((s = malloc(sizeof(char) * g->flen)) == NULL) return SSI_ERR_MALLOC;
981 for (i = 0; i < g->nfiles; i++)
982 {
983 file_flags = 0;
984 if (g->bpl[i] > 0 && g->rpl[i] > 0) file_flags |= SSI_FAST_SUBSEQ;
985
986 strcpy(s, g->filenames[i]);
987 if (fwrite(s, sizeof(char), g->flen, fp) != g->flen) return SSI_ERR_FWRITE;
988 if (! write_i32(fp, g->fileformat[i])) return SSI_ERR_FWRITE;
989 if (! write_i32(fp, file_flags)) return SSI_ERR_FWRITE;
990 if (! write_i32(fp, g->bpl[i])) return SSI_ERR_FWRITE;
991 if (! write_i32(fp, g->rpl[i])) return SSI_ERR_FWRITE;
992 }
993 free(s);
994
995 /* The primary key section
996 */
997 if ((s = malloc(sizeof(char) * g->plen)) == NULL) return SSI_ERR_MALLOC;
998 if (g->external) {
999 char *buf = NULL;
1000 int buflen = 0;
1001 struct ssipkey_s pkey;
1002 for (i = 0; i < g->nprimary; i++)
1003 {
1004 if (sre_fgets(&buf, &buflen, g->ptmp) == NULL) return SSI_ERR_NODATA;
1005 if (parse_pkey_info(buf, g->smode, &pkey) != 0) return SSI_ERR_BADFORMAT;
1006 strcpy(s, pkey.key);
1007 if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE;
1008 if (! write_i16( fp, pkey.fnum)) return SSI_ERR_FWRITE;
1009 if (! write_offset(fp, &(pkey.r_off))) return SSI_ERR_FWRITE;
1010 if (! write_offset(fp, &(pkey.d_off))) return SSI_ERR_FWRITE;
1011 if (! write_i32( fp, pkey.len)) return SSI_ERR_FWRITE;
1012 }
1013 free(buf);
1014 } else {
1015 for (i = 0; i < g->nprimary; i++)
1016 {
1017 strcpy(s, g->pkeys[i].key);
1018 if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE;
1019 if (! write_i16( fp, g->pkeys[i].fnum)) return SSI_ERR_FWRITE;
1020 if (! write_offset(fp, &(g->pkeys[i].r_off))) return SSI_ERR_FWRITE;
1021 if (! write_offset(fp, &(g->pkeys[i].d_off))) return SSI_ERR_FWRITE;
1022 if (! write_i32( fp, g->pkeys[i].len)) return SSI_ERR_FWRITE;
1023 }
1024 }
1025
1026 /* The secondary key section
1027 */
1028 if (g->nsecondary > 0) {
1029 if ((s2 = malloc(sizeof(char) * g->slen)) == NULL) return SSI_ERR_MALLOC;
1030
1031 if (g->external) {
1032 struct ssiskey_s skey;
1033 char *buf = NULL;
1034 int n = 0;
1035
1036 for (i = 0; i < g->nsecondary; i++)
1037 {
1038 if (sre_fgets(&buf, &n, g->stmp) == NULL) return SSI_ERR_NODATA;
1039 if (parse_skey_info(buf, &skey) != 0) return SSI_ERR_BADFORMAT;
1040 strcpy(s2, skey.key);
1041 strcpy(s, skey.pkey);
1042 if (fwrite(s2, sizeof(char), g->slen, fp) != g->slen) return SSI_ERR_FWRITE;
1043 if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE;
1044 }
1045 free(buf);
1046 } else {
1047 for (i = 0; i < g->nsecondary; i++)
1048 {
1049 strcpy(s2, g->skeys[i].key);
1050 strcpy(s, g->skeys[i].pkey);
1051 if (fwrite(s2, sizeof(char), g->slen, fp) != g->slen) return SSI_ERR_FWRITE;
1052 if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE;
1053 }
1054 }
1055 free(s2);
1056 }
1057
1058 free(s);
1059 fclose(fp);
1060 return status;
1061 }
1062
1063
1064 /* Function: SSIFreeIndex()
1065 * Date: SRE, Tue Jan 2 11:44:08 2001 [St. Louis]
1066 *
1067 * Purpose: Free an index structure {g}.
1068 *
1069 * Args: g - ptr to an open index.
1070 *
1071 * Returns: (void)
1072 */
1073 void
1074 SSIFreeIndex(SSIINDEX *g)
1075 {
1076 int i;
1077 if (g != NULL)
1078 {
1079 if (g->external == FALSE) {
1080 for (i = 0; i < g->nprimary; i++) free(g->pkeys[i].key);
1081 for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].key);
1082 for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].pkey);
1083 if (g->pkeys != NULL) free(g->pkeys);
1084 if (g->skeys != NULL) free(g->skeys);
1085 } else {
1086 if (g->ptmp != NULL) fclose(g->ptmp);
1087 if (g->stmp != NULL) fclose(g->stmp);
1088 #if DEBUGLEVEL == 0
1089 remove(g->ptmpfile);
1090 remove(g->stmpfile);
1091 #endif
1092 }
1093 for (i = 0; i < g->nfiles; i++) free(g->filenames[i]);
1094 if (g->filenames != NULL) free(g->filenames);
1095 if (g->fileformat != NULL) free(g->fileformat);
1096 if (g->bpl != NULL) free(g->bpl);
1097 if (g->rpl != NULL) free(g->rpl);
1098 free(g);
1099 }
1100 }
1101
1102
1103 /* Function: SSIErrorString()
1104 * Date: SRE, Tue Jan 2 10:38:10 2001 [St. Louis]
1105 *
1106 * Purpose: Returns a ptr to an internal string corresponding
1107 * to error {n}, a code returned from any of the
1108 * functions in the API that return non-zero on error.
1109 *
1110 * Args: n - error code
1111 *
1112 * Returns: ptr to an internal string.
1113 */
1114 char *
1115 SSIErrorString(int n)
1116 {
1117 switch (n) {
1118 case SSI_ERR_OK: return "ok (no error)";
1119 case SSI_ERR_NODATA: return "no data, fread() failed";
1120 case SSI_ERR_NO_SUCH_KEY: return "no such key";
1121 case SSI_ERR_MALLOC: return "out of memory, malloc() failed";
1122 case SSI_ERR_NOFILE: return "file not found, fopen() failed";
1123 case SSI_ERR_BADMAGIC: return "not a SSI file? (bad magic)";
1124 case SSI_ERR_BADFORMAT: return "corrupt format? unexpected data";
1125 case SSI_ERR_NO64BIT: return "no large file support for this system";
1126 case SSI_ERR_SEEK_FAILED: return "failed to reposition on disk";
1127 case SSI_ERR_TELL_FAILED: return "failed to get file position on disk";
1128 case SSI_ERR_NO_SUBSEQS: return "no fast subseq support for this seqfile";
1129 case SSI_ERR_RANGE: return "subseq start is out of range";
1130 case SSI_ERR_BADARG: return "an argument is out of range";
1131 case SSI_ERR_TOOMANY_FILES: return "number of files exceeds limit";
1132 case SSI_ERR_TOOMANY_KEYS: return "number of keys exceeds limit";
1133 case SSI_ERR_FWRITE: return "an fwrite() failed";
1134 case SSI_ERR_EXTERNAL_SORT: return "some problem with external sorting";
1135 default: return "unrecognized code";
1136 }
1137 /*NOTREACHED*/
1138 }
1139
1140 static int
1141 read_i16(FILE *fp, sqd_uint16 *ret_result)
1142 {
1143 sqd_uint16 result;
1144 if (fread(&result, sizeof(sqd_uint16), 1, fp) != 1) return 0;
1145 *ret_result = sre_ntoh16(result);
1146 return 1;
1147 }
1148 static int
1149 write_i16(FILE *fp, sqd_uint16 n)
1150 {
1151 n = sre_hton16(n);
1152 if (fwrite(&n, sizeof(sqd_uint16), 1, fp) != 1) return 0;
1153 return 1;
1154 }
1155 static int
1156 read_i32(FILE *fp, sqd_uint32 *ret_result)
1157 {
1158 sqd_uint32 result;
1159 if (fread(&result, sizeof(sqd_uint32), 1, fp) != 1) return 0;
1160 *ret_result = sre_ntoh32(result);
1161 return 1;
1162 }
1163 static int
1164 write_i32(FILE *fp, sqd_uint32 n)
1165 {
1166 n = sre_hton32(n);
1167 if (fwrite(&n, sizeof(sqd_uint32), 1, fp) != 1) return 0;
1168 return 1;
1169 }
1170 static int
1171 read_i64(FILE *fp, sqd_uint64 *ret_result)
1172 {
1173 sqd_uint64 result;
1174 if (fread(&result, sizeof(sqd_uint64), 1, fp) != 1) return 0;
1175 *ret_result = sre_ntoh64(result);
1176 return 1;
1177 }
1178 static int
1179 write_i64(FILE *fp, sqd_uint64 n)
1180 {
1181 n = sre_hton64(n);
1182 if (fwrite(&n, sizeof(sqd_uint64), 1, fp) != 1) return 0;
1183 return 1;
1184 }
1185 static int
1186 read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset)
1187 {
1188 if (mode == SSI_OFFSET_I32) {
1189 ret_offset->mode = SSI_OFFSET_I32;
1190 if (! read_i32(fp, &(ret_offset->off.i32))) return 0;
1191 } else if (mode == SSI_OFFSET_I64) {
1192 ret_offset->mode = SSI_OFFSET_I64;
1193 if (! read_i64(fp, &(ret_offset->off.i64))) return 0;
1194 } else return 0;
1195
1196 return 1;
1197 }
1198 static int
1199 write_offset(FILE *fp, SSIOFFSET *offset)
1200 {
1201 if (offset->mode == SSI_OFFSET_I32) return write_i32(fp, offset->off.i32);
1202 else if (offset->mode == SSI_OFFSET_I64) return write_i64(fp, offset->off.i64);
1203 else abort();
1204 /*UNREACHED*/
1205 return 1; /* silence bitchy compilers */
1206 }
1207
1208 static int
1209 parse_pkey_info(char *buf, char mode, struct ssipkey_s *pkey)
1210 {
1211 char *s, *tok;
1212 int n;
1213
1214 s = buf;
1215 if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT;
1216 pkey->key = tok;
1217 if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT;
1218 pkey->fnum = (sqd_uint16) atoi(tok);
1219
1220 if (mode == SSI_OFFSET_I32) {
1221 if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT;
1222 pkey->r_off.mode = mode;
1223 pkey->r_off.off.i32 = (sqd_uint32) strtoul(tok, NULL, 10);
1224 if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT;
1225 pkey->d_off.mode = mode;
1226 pkey->d_off.off.i32 = (sqd_uint32) strtoul(tok, NULL, 10);
1227 }
1228 #ifdef HAS_64BIT_FILE_OFFSETS
1229 else {
1230 if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT;
1231 pkey->r_off.mode = mode;
1232 pkey->r_off.off.i64 = (sqd_uint64) strtoull(tok, NULL, 10);
1233 if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT;
1234 pkey->d_off.mode = mode;
1235 pkey->d_off.off.i64 = (sqd_uint64) strtoull(tok, NULL, 10);
1236 }
1237 #else
1238 else {
1239 return SSI_ERR_NO64BIT;
1240 }
1241 #endif
1242 if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT;
1243 pkey->len = (sqd_uint32) strtoul(tok, NULL, 10);
1244
1245 return 0;
1246 }
1247 static int
1248 parse_skey_info(char *buf, struct ssiskey_s *skey)
1249 {
1250 char *s, *tok;
1251 int n;
1252
1253 s = buf;
1254 if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT;
1255 skey->key = tok;
1256 if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT;
1257 skey->pkey = tok;
1258 return 0;
1259 }
1260
1261 /* Function: binary_search()
1262 * Date: SRE, Sun Dec 31 16:05:03 2000 [St. Louis]
1263 *
1264 * Purpose: Find a key in a SSI index, by a binary search
1265 * in an alphabetically sorted list of keys. If successful,
1266 * return 0, and the index file is positioned to read
1267 * the rest of the data for that key. Else returns nonzero.
1268 *
1269 * Args: sfp - an open SSIFILE
1270 * key - key to find
1271 * klen - key length to allocate (plen or slen from sfp)
1272 * base - base offset (poffset or soffset)
1273 * recsize - size of each key record in bytes (precsize or srecsize)
1274 * maxidx - # of keys (nprimary or nsecondary)
1275 *
1276 * Returns: 0 on success, and leaves file positioned for reading remaining
1277 * data for the key.
1278 * Nonzero on failure:
1279 * SSI_ERR_NO_SUCH_KEY - that key's not in the index
1280 * SSI_ERR_MALLOC - a memory allocation failure
1281 * SSI_ERR_NODATA - an fread() failed
1282 */
1283 static int
1284 binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base,
1285 sqd_uint32 recsize, sqd_uint32 maxidx)
1286 {
1287 char *name;
1288 sqd_uint32 left, right, mid;
1289 int cmp;
1290 int status;
1291
1292 if (maxidx == 0) return SSI_ERR_NO_SUCH_KEY; /* special case: empty index */
1293 if ((name = malloc (sizeof(char)*klen)) == NULL) return SSI_ERR_MALLOC;
1294 left = 0;
1295 right = maxidx-1;
1296 while (1) { /* A binary search: */
1297 mid = (left+right) / 2; /* careful here. only works because
1298 we limit unsigned vars to signed ranges. */
1299 if ((status = indexfile_position(sfp, base, recsize, mid)) != 0)
1300 { free(name); return status; }
1301 if (fread(name, sizeof(char), klen, sfp->fp) != klen)
1302 { free(name); return SSI_ERR_NODATA; }
1303 cmp = strcmp(name, key);
1304 if (cmp == 0) break; /* found it! */
1305 else if (left >= right) /* oops, missed it; fail */
1306 { free(name); return SSI_ERR_NO_SUCH_KEY; }
1307 else if (cmp < 0) left = mid+1; /* it's right of mid */
1308 else if (cmp > 0) {
1309 if (mid == 0) { free(name); return SSI_ERR_NO_SUCH_KEY; } /* special case, beware */
1310 else right = mid-1; /* it's left of mid */
1311 }
1312 }
1313 free(name);
1314 return 0; /* and sfp->fp is positioned... */
1315 }
1316
1317 /* Function: indexfile_position()
1318 * Date: SRE, Mon Jan 1 19:32:49 2001 [St. Louis]
1319 *
1320 * Purpose: Position the open index file {sfp} at the start
1321 * of record {n} in a list of records that starts at
1322 * base offset {base}, where each record takes up {l}
1323 * bytes. (e.g. the position is byte (base + n*l)).
1324 *
1325 * Args: sfp - open SSIFILE
1326 * base - offset of record 0 (e.g. sfp->foffset)
1327 * len - size of each record in bytes (e.g. sfp->frecsize)
1328 * n - which record to get (e.g. 0..sfp->nfiles)
1329 *
1330 * Returns: 0 on success, non-zero on failure.
1331 */
1332 static int
1333 indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len, sqd_uint32 n)
1334 {
1335 SSIOFFSET pos;
1336 int status;
1337
1338 if (base->mode == SSI_OFFSET_I32) {
1339 pos.mode = SSI_OFFSET_I32;
1340 pos.off.i32 = base->off.i32 + n*len;
1341 } else if (base->mode == SSI_OFFSET_I64) {
1342 pos.mode = SSI_OFFSET_I64;
1343 pos.off.i64 = base->off.i64 + n*len;
1344 } else return 0;
1345 if ((status = SSISetFilePosition(sfp->fp, &pos)) != 0) return status;
1346 return 0;
1347 }
1348
1349 /* Function: current_index_size()
1350 * Date: SRE, Tue Feb 20 18:23:30 2001 [St. Louis]
1351 *
1352 * Purpose: Calculates the size of the current index,
1353 * in megabytes.
1354 */
1355 static sqd_uint64
1356 current_index_size(SSIINDEX *g)
1357 {
1358 sqd_uint64 frecsize, precsize, srecsize;
1359 sqd_uint64 total;
1360
1361 /* Magic-looking numbers come from adding up sizes
1362 * of things in bytes
1363 */
1364 frecsize = 16 + g->flen;
1365 precsize = (g->smode == SSI_OFFSET_I64) ? 22+g->plen : 14+g->plen;
1366 srecsize = g->plen+g->slen;
1367 total = (66L + /* header size, if 64bit index offsets */
1368 frecsize * g->nfiles + /* file section size */
1369 precsize * g->nprimary + /* primary key section size */
1370 srecsize * g->nsecondary) / /* secondary key section size */
1371 1048576L;
1372 return total;
1373 }
1374 /* Function: activate_external_sort()
1375 * Date: SRE, Mon Feb 4 09:08:08 2002 [St. Louis]
1376 *
1377 * Purpose: Switch to external sort mode.
1378 * Open file handles for external index files (ptmp, stmp).
1379 * Flush current index information to these files.
1380 * Free current memory, turn over control to the tmpfiles.
1381 *
1382 * Return: 0 on success; non-zero on failure.
1383 */
1384 static int
1385 activate_external_sort(SSIINDEX *g)
1386 {
1387 int i;
1388 /* it's a bit late to be checking this, but... */
1389 if (g->external) return 0; /* we already are external, fool */
1390 if (FileExists(g->ptmpfile)) return 1;
1391 if (FileExists(g->stmpfile)) return 1;
1392 if ((g->ptmp = fopen(g->ptmpfile, "w")) == NULL) return 1;
1393 if ((g->stmp = fopen(g->stmpfile, "w")) == NULL) return 1;
1394
1395 /* Flush the current indices.
1396 */
1397 SQD_DPRINTF1(("Switching to external sort - flushing ssiindex to disk...\n"));
1398 for (i = 0; i < g->nprimary; i++) {
1399 if (g->smode == SSI_OFFSET_I32) {
1400 fprintf(g->ptmp, "%s\t%u\t%lu\t%lu\t%lu\n",
1401 g->pkeys[i].key, g->pkeys[i].fnum,
1402 (unsigned long) g->pkeys[i].r_off.off.i32,
1403 (unsigned long) g->pkeys[i].d_off.off.i32,
1404 (unsigned long) g->pkeys[i].len);
1405 } else {
1406 fprintf(g->ptmp, "%s\t%u\t%llu\t%llu\t%lu\n",
1407 g->pkeys[i].key, g->pkeys[i].fnum,
1408 (unsigned long long) g->pkeys[i].r_off.off.i64,
1409 (unsigned long long) g->pkeys[i].d_off.off.i64,
1410 (unsigned long) g->pkeys[i].len);
1411 }
1412 }
1413 for (i = 0; i < g->nsecondary; i++)
1414 fprintf(g->stmp, "%s\t%s\n", g->skeys[i].key, g->skeys[i].pkey);
1415
1416 /* Free the memory now that we've flushed our lists to disk
1417 */
1418 for (i = 0; i < g->nprimary; i++) free(g->pkeys[i].key);
1419 for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].key);
1420 for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].pkey);
1421 if (g->pkeys != NULL) free(g->pkeys);
1422 if (g->skeys != NULL) free(g->skeys);
1423 g->pkeys = NULL;
1424 g->skeys = NULL;
1425
1426 /* Turn control over to external accumulation mode.
1427 */
1428 g->external = TRUE;
1429 return 0;
1430 }
1431
1432
1433 /*****************************************************************
1434 * Debugging API
1435 *****************************************************************/
1436 void
1437 SSIForceExternalSort(SSIINDEX *g)
1438 {
1439 if (activate_external_sort(g) != 0)
1440 Die("failed to turn external sorting on.");
1441 }
1442
1443
1444 /*****************************************************************
1445 * Test driving mode
1446 *****************************************************************/
1447 #ifdef MUGGINS_LETS_ME_SLEEP
1448 /* Minimally:
1449 cc -g -Wall -o shiva -DDEBUGLEVEL=1 -DMUGGINS_LETS_ME_SLEEP ssi.c sqerror.c sre_string.c types.c sre_ctype.c sre_math.c file.c -lm
1450 */
1451
1452 int
1453 main(int argc, char **argv)
1454 {
1455 char name[32], accession[32];
1456 SSIINDEX *ssi;
1457 int mode;
1458 SSIOFFSET r_off, d_off;
1459 FILE *ofp;
1460 int i;
1461 int fh; /* a file handle */
1462 int status; /* return status from a SSI call */
1463
1464 mode = SSI_OFFSET_I32;
1465 if ((ssi = SSICreateIndex(mode)) == NULL)
1466 Die("Failed to allocate SSI index");
1467
1468 /* Generate two FASTA files, tmp.0 and tmp.1, and index them.
1469 */
1470 if ((ofp = fopen("tmp.0", "w")) == NULL)
1471 Die("failed to open tmp.0");
1472 if ((status = SSIAddFileToIndex(ssi, "tmp.0", SQFILE_FASTA, &fh)) != 0)
1473 Die("SSIAddFileToIndex() failed: %s", SSIErrorString(status));
1474 for (i = 0; i < 10; i++) {
1475 if ((status = SSIGetFilePosition(ofp, mode, &r_off)) != 0)
1476 Die("SSIGetFilePosition() failed: %s", SSIErrorString(status));
1477 sprintf(name, "seq%d", i);
1478 sprintf(accession, "ac%d", i);
1479 fprintf(ofp, ">%s [%s] Description? we don't need no steenking description.\n",
1480 name, accession);
1481 if ((status = SSIGetFilePosition(ofp, mode, &d_off)) != 0)
1482 Die("SSIGetFilePosition() failed: %s", SSIErrorString(status));
1483 fprintf(ofp, "AAAAAAAAAA\n");
1484 fprintf(ofp, "CCCCCCCCCC\n");
1485 fprintf(ofp, "GGGGGGGGGG\n");
1486 fprintf(ofp, "TTTTTTTTTT\n");
1487
1488 if ((status = SSIAddPrimaryKeyToIndex(ssi, name, fh, &r_off, &d_off, 40)) != 0)
1489 Die("SSIAddPrimaryKeyToIndex() failed: %s", SSIErrorString(status));
1490 if ((status = SSIAddSecondaryKeyToIndex(ssi, accession, name)) != 0)
1491 Die("SSIAddSecondaryKeyToIndex() failed: %s", SSIErrorString(status));
1492 }
1493 SSISetFileForSubseq(ssi, fh, 11, 10);
1494 fclose(ofp);
1495
1496 if ((ofp = fopen("tmp.1", "w")) == NULL)
1497 Die("failed to open tmp.1");
1498 if ((status = SSIAddFileToIndex(ssi, "tmp.1", SQFILE_FASTA, &fh)) != 0)
1499 Die("SSIAddFileToIndex() failed: %s", SSIErrorString(status));
1500 for (i = 10; i < 20; i++) {
1501 if ((status = SSIGetFilePosition(ofp, mode, &r_off)) != 0)
1502 Die("SSIGetFilePosition() failed: %s", SSIErrorString(status));
1503 sprintf(name, "seq%d", i);
1504 sprintf(accession, "ac%d", i);
1505 fprintf(ofp, ">%s [%s] i/o, i/o, it's off to disk we go.\n",
1506 name, accession);
1507 if ((status = SSIGetFilePosition(ofp, mode, &d_off)) != 0)
1508 Die("SSIGetFilePosition() failed: %s", SSIErrorString(status));
1509 fprintf(ofp, "AAAAAAAAAA 10\n");
1510 fprintf(ofp, "CCCCCCCCCC 20\n");
1511 fprintf(ofp, "GGGGGGGGGG 30\n");
1512 fprintf(ofp, "TTTTTTTTTT 40\n");
1513
1514 if ((status = SSIAddPrimaryKeyToIndex(ssi, name, fh, &r_off, &d_off, 40)) != 0)
1515 Die("SSIAddPrimaryKeyToIndex() failed: %s", SSIErrorString(status));
1516 if ((status = SSIAddSecondaryKeyToIndex(ssi, accession, name)) != 0)
1517 Die("SSIAddSecondaryKeyToIndex() failed: %s", SSIErrorString(status));
1518 }
1519 SSISetFileForSubseq(ssi, fh, 14, 10);
1520 fclose(ofp);
1521
1522 /* Write the index to tmp.ssi
1523 */
1524 if ((status = SSIWriteIndex("tmp.ssi", ssi)) != 0)
1525 Die("SSIWriteIndex() failed: %s", SSIErrorString(status));
1526 SSIFreeIndex(ssi);
1527
1528 /* Now reopen the index and run some tests.
1529 */
1530 exit(0);
1531 }
1532
1533
1534 #endif /* test driving code */
1535
1536
1537