Mercurial > repos > clustalomega > clustalomega
diff clustalomega/clustal-omega-0.2.0/src/squid/ssi.c @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author | clustalomega |
---|---|
date | Tue, 07 Jun 2011 17:04:25 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/clustalomega/clustal-omega-0.2.0/src/squid/ssi.c Tue Jun 07 17:04:25 2011 -0400 @@ -0,0 +1,1537 @@ +/***************************************************************** + * SQUID - a library of functions for biological sequence analysis + * Copyright (C) 1992-2002 Washington University School of Medicine + * + * This source code is freely distributed under the terms of the + * GNU General Public License. See the files COPYRIGHT and LICENSE + * for details. + *****************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include "squid.h" +#include "ssi.h" +#ifdef CLUSTALO +#include <limits.h> +#endif + +static sqd_uint32 v20magic = 0xf3f3e9b1; /* SSI 1.0: "ssi1" + 0x80808080 */ +static sqd_uint32 v20swap = 0xb1e9f3f3; /* byteswapped */ + +static int read_i16(FILE *fp, sqd_uint16 *ret_result); +static int read_i32(FILE *fp, sqd_uint32 *ret_result); +static int read_i64(FILE *fp, sqd_uint64 *ret_result); +static int read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset); +static int write_i16(FILE *fp, sqd_uint16 n); +static int write_i32(FILE *fp, sqd_uint32 n); +static int write_i64(FILE *fp, sqd_uint64 n); +static int write_offset(FILE *fp, SSIOFFSET *offset); +static int binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base, + sqd_uint32 recsize, sqd_uint32 maxidx); +static int indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len, + sqd_uint32 n); +static void clear_ssifile(SSIFILE *sfp); +static sqd_uint64 current_index_size(SSIINDEX *g); +static int activate_external_sort(SSIINDEX *g); +static int load_indexfile(SSIFILE *sfp); +static int parse_pkey_info(char *buf, char mode, struct ssipkey_s *pkey); +static int parse_skey_info(char *buf, struct ssiskey_s *skey); + +/* Function: SSIOpen() + * Date: SRE, Sun Dec 31 12:40:03 2000 [St. Louis] + * + * Purpose: Opens the SSI index file {filename} and returns + * a SSIFILE * stream thru {ret_sfp}. + * The caller must eventually close this stream using + * SSIClose(). More than one index file can be open + * at once. + * + * Args: filename - full path to a SSI index file + * + * Returns: Returns 0 on success, nonzero on failure. + */ +int +SSIOpen(char *filename, SSIFILE **ret_sfp) +{ + SSIFILE *sfp = NULL; + int status; + if ((sfp = malloc(sizeof(SSIFILE))) == NULL) return SSI_ERR_MALLOC; + if ((sfp->fp = fopen(filename, "rb")) == NULL) { + free(sfp); + return SSI_ERR_NOFILE; + } + status = load_indexfile(sfp); + *ret_sfp = sfp; + return status; +} +/* load_indexfile(): given a SSIFILE structure with an open and positioned + * stream (fp) -- but no other data loaded -- read the next SSIFILE + * in from disk. We use this routine without its SSIOpen() wrapper + * as part of the external mergesort when creating large indices. + */ +static int +load_indexfile(SSIFILE *sfp) +{ + sqd_uint32 magic; + sqd_uint16 i; /* counter over files */ + int status; /* overall return status if an error is thrown */ + + status = SSI_ERR_BADFORMAT; /* default: almost every kind of error is a bad format error */ + + sfp->filename = NULL; + sfp->fileformat = NULL; + sfp->fileflags = NULL; + sfp->bpl = NULL; + sfp->rpl = NULL; + sfp->nfiles = 0; + if (! read_i32(sfp->fp, &magic)) {status = SSI_ERR_BADMAGIC; goto FAILURE; } + if (magic != v20magic && magic != v20swap) {status = SSI_ERR_BADMAGIC; goto FAILURE; } + if (! read_i32(sfp->fp, &(sfp->flags))) goto FAILURE; + + /* If we have 64-bit offsets, make sure we can deal with them. + */ +#ifndef HAS_64BIT_FILE_OFFSETS + if ((sfp->flags & SSI_USE64_INDEX) || + (sfp->flags & SSI_USE64)) + { status = SSI_ERR_NO64BIT; goto FAILURE; } +#endif + + sfp->imode = (sfp->flags & SSI_USE64_INDEX) ? SSI_OFFSET_I64 : SSI_OFFSET_I32; + sfp->smode = (sfp->flags & SSI_USE64) ? SSI_OFFSET_I64 : SSI_OFFSET_I32; + + if (! read_i16(sfp->fp, &(sfp->nfiles))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->nprimary))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->nsecondary))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->flen))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->plen))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->slen))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->frecsize))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->precsize))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->srecsize))) goto FAILURE; + + if (! read_offset(sfp->fp, sfp->imode, &(sfp->foffset))) goto FAILURE; + if (! read_offset(sfp->fp, sfp->imode, &(sfp->poffset))) goto FAILURE; + if (! read_offset(sfp->fp, sfp->imode, &(sfp->soffset))) goto FAILURE; + + /* Read the file information and keep it. + * We expect the number of files to be small, so reading it + * once should be advantageous overall. If SSI ever had to + * deal with large numbers of files, you'd probably want to + * read file information on demand. + */ + if (sfp->nfiles == 0) goto FAILURE; + if ((sfp->filename=malloc(sizeof(char *) *sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + for (i = 0; i < sfp->nfiles; i++) sfp->filename[i] = NULL; + if ((sfp->fileformat=malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + if ((sfp->fileflags =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + if ((sfp->bpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + if ((sfp->rpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + + for (i = 0; i < sfp->nfiles; i++) + { + /* We have to explicitly position, because header and file + * records may expand in the future; frecsize and foffset + * give us forwards compatibility. + */ + if (indexfile_position(sfp, &(sfp->foffset), sfp->frecsize, i) !=0) goto FAILURE; + if ((sfp->filename[i] =malloc(sizeof(char)*sfp->flen)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + if (fread(sfp->filename[i],sizeof(char),sfp->flen, sfp->fp)!=sfp->flen) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->fileformat[i]))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->fileflags[i]))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->bpl[i]))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->rpl[i]))) goto FAILURE; + } + + /* Success. Return 0. + */ + return 0; + + FAILURE: + /* Failure: free the damaged structure, return status code. + */ + SSIClose(sfp); + return status; +} + + + +/* Function: SSIGetOffsetByName() + * Date: SRE, Sun Dec 31 13:55:31 2000 [St. Louis] + * + * Purpose: Looks up the string {key} in the open index {sfp}. + * {key} can be either a primary or secondary key. If {key} + * is found, {*ret_fh} contains a unique handle on + * the file that contains {key} (suitable for an SSIFileInfo() + * call, or for comparison to the handle of the last file + * that was opened for retrieval), and {offset} is filled + * in with the offset in that file. + * + * Args: sfp - open index file + * key - string to search for + * ret_fh - RETURN: handle on file that key is in + * ret_offset - RETURN: offset of the start of that key's record + * + * Returns: 0 on success. + * non-zero on error. + */ +int +SSIGetOffsetByName(SSIFILE *sfp, char *key, int *ret_fh, + SSIOFFSET *ret_offset) +{ + int status; + sqd_uint16 fnum; + + /* Look in the primary keys. + */ + status = binary_search(sfp, key, sfp->plen, &(sfp->poffset), sfp->precsize, + sfp->nprimary); + if (status == 0) { + /* We found it as a primary key; get our data & return. + */ + if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA; + *ret_fh = (int) fnum; + if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA; + + return 0; /* success! (we don't need the other key data) */ + } else if (status == SSI_ERR_NO_SUCH_KEY) { + /* Not in the primary keys? OK, try the secondary keys. + */ + if (sfp->nsecondary > 0) { + char *pkey; + status = binary_search(sfp, key, sfp->slen, &(sfp->soffset), sfp->srecsize, + sfp->nsecondary); + if (status != 0) return status; + if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC; + if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA; + + status = SSIGetOffsetByName(sfp, pkey, ret_fh, ret_offset); + free(pkey); + } + return status; + + } else return status; + /*NOTREACHED*/ +} + +/* Function: SSIGetOffsetByNumber() + * Date: SRE, Mon Jan 1 19:42:42 2001 [St. Louis] + * + * Purpose: Looks up primary key #{n} in the open index {sfp}. + * {n} ranges from 0..nprimary-1. When key #{n} + * is found, {*ret_fh} contains a unique + * handle on the file that contains {key} (suitable + * for an SSIFileInfo() call, or for comparison to + * the handle of the last file that was opened for retrieval), + * and {offset} is filled in with the offset in that file. + * + * Args: sfp - open index file + * n - primary key number to retrieve. + * ret_fh - RETURN: handle on file that key is in + * ret_offset - RETURN: offset of the start of that key's record + * + * Returns: 0 on success. + * non-zero on error. + */ +int +SSIGetOffsetByNumber(SSIFILE *sfp, int n, int *ret_fh, SSIOFFSET *ret_offset) +{ + sqd_uint16 fnum; + char *pkey; + + if (n >= sfp->nprimary) return SSI_ERR_NO_SUCH_KEY; + if (indexfile_position(sfp, &(sfp->poffset), sfp->precsize, n) != 0) + return SSI_ERR_SEEK_FAILED; + + if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC; + if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA; + if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA; + if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA; + *ret_fh = fnum; + free(pkey); + return 0; +} + +/* Function: SSIGetSubseqOffset() + * Date: SRE, Mon Jan 1 19:49:31 2001 [St. Louis] + * + * Purpose: Implements SSI_FAST_SUBSEQ. + * + * Looks up a primary or secondary {key} in the open + * index {sfp}. Asks for the nearest offset to a + * subsequence starting at position {requested_start} + * in the sequence (numbering the sequence 1..L). + * If {key} is found, on return, {ret_fh} + * contains a unique handle on the file that contains + * {key} (suitable for an SSIFileInfo() call, or for + * comparison to the handle of the last file that was + * opened for retrieval); {record_offset} contains the + * disk offset to the start of the record; {data_offset} + * contains the disk offset either exactly at the requested + * residue, or at the start of the line containing the + * requested residue; {ret_actual_start} contains the + * coordinate (1..L) of the first valid residue at or + * after {data_offset}. {ret_actual_start} is <= + * {requested_start}. + * + * Args: sfp - open index file + * key - primary or secondary key to find + * requested_start - residue we'd like to start at (1..L) + * ret_fh - RETURN: handle for file the key is in + * record_offset - RETURN: offset of entire record + * data_offset - RETURN: offset of subseq (see above) + * ret_actual_start- RETURN: coord (1..L) of residue at data_offset + * + * Returns: 0 on success, non-zero on failure. + */ +int +SSIGetSubseqOffset(SSIFILE *sfp, char *key, int requested_start, + int *ret_fh, SSIOFFSET *record_offset, + SSIOFFSET *data_offset, int *ret_actual_start) +{ + int status; + sqd_uint32 len; + int r, b, i, l; /* tmp variables for "clarity", to match docs */ + + /* Look up the key. Rely on the fact that SSIGetOffsetByName() + * leaves the index file positioned at the rest of the data for this key. + */ + status = SSIGetOffsetByName(sfp, key, ret_fh, record_offset); + if (status != 0) return status; + + /* Check that we're allowed to do subseq lookup on that file. + */ + if (! (sfp->fileflags[*ret_fh] & SSI_FAST_SUBSEQ)) + return SSI_ERR_NO_SUBSEQS; + + /* Read the data we need for subseq lookup + */ + if (! read_offset(sfp->fp, sfp->smode, data_offset)) return SSI_ERR_NODATA; + if (! read_i32(sfp->fp, &len)) return SSI_ERR_NODATA; + + /* Set up tmp variables for clarity of equations below, + * and to make them match documentation (ssi-format.tex). + */ + r = sfp->rpl[*ret_fh]; /* residues per line */ + b = sfp->bpl[*ret_fh]; /* bytes per line */ + i = requested_start; /* start position 1..L */ + l = (i-1)/r; /* data line # (0..) that the residue is on */ + if (r == 0 || b == 0) return SSI_ERR_NO_SUBSEQS; + if (i < 0 || i > len) return SSI_ERR_RANGE; + + /* When b = r+1, there's nothing but sequence on each data line (and the \0), + * and we can find each residue precisely. + */ + if (b == r+1) { + if (sfp->smode == SSI_OFFSET_I32) { + data_offset->mode = SSI_OFFSET_I32; + data_offset->off.i32 = data_offset->off.i32 + l*b + (i-1)%r; + } else if (sfp->smode == SSI_OFFSET_I64) { + data_offset->mode = SSI_OFFSET_I64; + data_offset->off.i64 = data_offset->off.i64 + l*b + (i-1)%r; + } + *ret_actual_start = requested_start; + } else { + /* else, there's other stuff on seq lines, so the best + * we can do easily is to position at start of relevant line. + */ + if (sfp->smode == SSI_OFFSET_I32) { + data_offset->mode = SSI_OFFSET_I32; + data_offset->off.i32 = data_offset->off.i32 + l*b; + } else if (sfp->smode == SSI_OFFSET_I64) { + data_offset->mode = SSI_OFFSET_I64; + data_offset->off.i64 = data_offset->off.i64 + l*b; + } + /* yes, the eq below is = 1 + (i-1)/r*r but it's not = i. that's an integer /. */ + *ret_actual_start = 1 + l*r; + } + return 0; +} + +/* Function: SSISetFilePosition() + * Date: SRE, Tue Jan 2 09:13:46 2001 [St. Louis] + * + * Purpose: Uses {offset} to sets the file position for {fp}, usually an + * open sequence file, relative to the start of the file. + * Hides the details of system-dependent shenanigans necessary for + * file positioning in large (>2 GB) files. + * + * Behaves just like fseek(fp, offset, SEEK_SET) for 32 bit + * offsets and <2 GB files. + * + * Warning: if all else fails, in desperation, it will try to + * use fsetpos(). This requires making assumptions about fpos_t + * that may be unwarranted... assumptions that ANSI C prohibits + * me from making... though I believe the ./configure + * script robustly tests whether I can play with fpos_t like this. + * + * Args: fp - file to position. + * offset - SSI offset relative to file start. + * + * Returns: 0 on success, nonzero on error. + */ +int +SSISetFilePosition(FILE *fp, SSIOFFSET *offset) +{ + if (offset->mode == SSI_OFFSET_I32) { + if (fseek(fp, offset->off.i32, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; + } +#ifndef HAS_64BIT_FILE_OFFSETS + else return SSI_ERR_NO64BIT; +#elif defined HAVE_FSEEKO && SIZEOF_OFF_T == 8 + else if (fseeko(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; +#elif defined HAVE_FSEEKO64 && SIZEOF_OFF64_T == 8 + else if (fseeko64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; +#elif defined HAVE_FSEEK64 + else if (fseek64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; +#elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8 + else if (fsetpos(fp, &(offset->off.i64)) != 0) return SSI_ERR_SEEK_FAILED; +#endif + return 0; +} + + +/* Function: SSIFileInfo() + * Date: SRE, Tue Jan 2 10:31:01 2001 [St. Louis] + * + * Purpose: Given a file number {fh} in an open index file + * {sfp}, retrieve file name {ret_filename} and + * the file format {ret_format}. + * + * {ret_filename} is a pointer to a string maintained + * internally by {sfp}. It should not be free'd; + * SSIClose(sfp) takes care of it. + * + * Args: sfp - open index file + * fh - handle on file to look up + * ret_filename - RETURN: name of file n + * ret_format - RETURN: format of file n + * + * Returns: 0 on success, nonzero on failure. + */ +int +SSIFileInfo(SSIFILE *sfp, int fh, char **ret_filename, int *ret_format) +{ + if (fh < 0 || fh >= sfp->nfiles) return SSI_ERR_BADARG; + *ret_filename = sfp->filename[fh]; + *ret_format = sfp->fileformat[fh]; + return 0; +} + +/* Function: SSIClose() + * Date: SRE, Sun Dec 31 14:56:37 2000 [St. Louis] + * + * Purpose: Close an open {SSIFILE *}. + * + * Args: sfp - index file to close. + * + * Returns: (void) + */ +void +SSIClose(SSIFILE *sfp) +{ + if (sfp != NULL) { + clear_ssifile(sfp); + if (sfp->fp != NULL) fclose(sfp->fp); + free(sfp); + } +} +/* clear_ssifile(): free the innards of SSIFILE, without + * destroying the structure or closing the stream. + */ +static void +clear_ssifile(SSIFILE *sfp) +{ + int i; + + if (sfp->filename != NULL) { + for (i = 0; i < sfp->nfiles; i++) + if (sfp->filename[i] != NULL) free(sfp->filename[i]); + free(sfp->filename); + } + if (sfp->fileformat != NULL) free(sfp->fileformat); + if (sfp->fileflags != NULL) free(sfp->fileflags); + if (sfp->bpl != NULL) free(sfp->bpl); + if (sfp->rpl != NULL) free(sfp->rpl); +} + + +/* Function: SSIRecommendMode() + * Date: SRE, Fri Feb 16 08:23:47 2001 [St. Louis] + * + * Purpose: Examines the file and determines whether it should be + * indexed with large file support or not; returns + * SSI_OFFSET_I32 for most files, SSI_OFFSET_I64 for large + * files, or -1 on failure. + * + * Args: file - name of file to check for size + * + * Returns: -1 on failure (including case where file is too big) + * SSI_OFFSET_I32 for most files (<= 2^31-1 bytes) + * SSI_OFFSET_I64 for large files (> 2^31-1 bytes) + */ +int +SSIRecommendMode(char *file) +{ +#if HAVE_STAT64 + struct stat64 s1; + if (stat64(file, &s1) == 0) { + if (s1.st_size <= 2146483647L) return SSI_OFFSET_I32; + else return SSI_OFFSET_I64; + } +#else + struct stat s2; + if (stat(file, &s2) == 0) { + if (s2.st_size <= 2146483647L) return SSI_OFFSET_I32; + else return SSI_OFFSET_I64; + } +#endif + return -1; +} + + +/* Function: SSICreateIndex() + * Date: SRE, Tue Jan 2 11:23:25 2001 [St. Louis] + * + * Purpose: Creates and initializes a SSI index structure. + * Sequence file offset type is specified by {mode}. + * + * Args: mode - SSI_OFFSET_I32 or SSI_OFFSET_I64, sequence file index mode. + * + * Returns: ptr to new index structure, or NULL on failure. + * Caller is responsible for free'ing the returned + * structure with SSIFreeIndex(). + */ +SSIINDEX * +SSICreateIndex(int mode) +{ + SSIINDEX *g; + + g = NULL; + if ((g = malloc(sizeof(SSIINDEX))) == NULL) goto FAILURE; + g->smode = mode; + g->imode = SSI_OFFSET_I32; /* index always starts as 32-bit; may get upgraded later */ + g->external = FALSE; + g->max_ram = SSI_MAXRAM; + +#ifndef HAS_64BIT_FILE_OFFSETS + if (mode == SSI_OFFSET_I64) + Die("\ +Can't create a 64-bit SSI index on this system, sorry;\n\ +I don't have 64-bit file offset functions available.\n"); +#endif + + g->filenames = NULL; + g->fileformat = NULL; + g->bpl = NULL; + g->rpl = NULL; + g->flen = 0; + g->nfiles = 0; + + g->pkeys = NULL; + g->plen = 0; + g->nprimary = 0; + g->ptmpfile = "tmp.ssi.1"; /* hardcoded, for now. */ + g->ptmp = NULL; + + g->skeys = NULL; + g->slen = 0; + g->nsecondary = 0; + g->stmpfile = "tmp.ssi.2"; /* hardcoded, for now. */ + g->stmp = NULL; + + /* All mallocs must go after NULL initializations, because of the cleanup strategy; + * we'll try to free anything non-NULL if a malloc fails. + */ + if ((g->filenames = malloc(sizeof(char *) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; + if ((g->fileformat= malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; + if ((g->bpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; + if ((g->rpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; + + if ((g->pkeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE; + if ((g->skeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE; + + return g; + + FAILURE: + SSIFreeIndex(g); /* free the damaged structure */ + return NULL; +} + +/* Function: SSIGetFilePosition() + * Date: SRE, Tue Jan 2 09:59:26 2001 [St. Louis] + * + * Purpose: Fills {ret_offset} with the current disk + * offset of {fp}, relative to the start of the file. + * {mode} is set to either SSI_OFFSET_I32 or + * SSI_OFFSET_I64. If {mode} is _I32 (32 bit), just wraps + * a call to ftell(); otherwise deals with system-dependent + * details of 64-bit file offsets. + * + * Args: fp - open stream + * mode - SSI_OFFSET_I32 or SSI_OFFSET_I64 + * ret_offset - RETURN: file position + * + * Returns: 0 on success. nonzero on error. + */ +int +SSIGetFilePosition(FILE *fp, int mode, SSIOFFSET *ret_offset) +{ + if (mode == SSI_OFFSET_I32) + { + ret_offset->mode = SSI_OFFSET_I32; + ret_offset->off.i32 = ftell(fp); + if (ret_offset->off.i32 == -1) return SSI_ERR_TELL_FAILED; + } + else if (mode != SSI_OFFSET_I64) abort(); /* only happens on a coding error */ + else { + ret_offset->mode = SSI_OFFSET_I64; +#ifndef HAS_64BIT_FILE_OFFSETS + return SSI_ERR_NO64BIT; +#elif defined HAVE_FTELLO && SIZEOF_OFF_T == 8 + if ((ret_offset->off.i64 = ftello(fp)) == -1) return SSI_ERR_TELL_FAILED; +#elif defined HAVE_FTELLO64 && SIZEOF_OFF64_T == 8 + if ((ret_offset->off.i64 = ftello64(fp)) == -1) return SSI_ERR_TELL_FAILED; +#elif defined HAVE_FTELL64 + if ((ret_offset->off.i64 = ftell64(fp)) == -1) return SSI_ERR_TELL_FAILED; +#elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8 + if (fgetpos(fp, &(ret_offset->off.i64)) != 0) return SSI_ERR_TELL_FAILED; +#endif + } + return 0; +} + +/* Function: SSIAddFileToIndex() + * Date: SRE, Tue Jan 2 12:54:36 2001 [St. Louis] + * + * Purpose: Adds the sequence file {filename}, which is known to + * be in format {fmt}, to the index {g}. Creates and returns + * a unique filehandle {fh} for then associating primary keys + * with this file using SSIAddPrimaryKeyToIndex(). + * + * Args: g - active index + * filename - file to add + * fmt - format code for this file (e.g. SQFILE_FASTA) + * ret_fh - RETURN: unique handle for this file + * + * Returns: 0 on success; nonzero on error. + */ +int +SSIAddFileToIndex(SSIINDEX *g, char *filename, int fmt, int *ret_fh) +{ + int n; + + if (g->nfiles >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES; + + n = strlen(filename); + if ((n+1) > g->flen) g->flen = n+1; + + g->filenames[g->nfiles] = FileTail(filename, FALSE); + g->fileformat[g->nfiles] = fmt; + g->bpl[g->nfiles] = 0; + g->rpl[g->nfiles] = 0; + *ret_fh = g->nfiles; /* handle is simply = file number */ + g->nfiles++; + + if (g->nfiles % SSI_FILE_BLOCK == 0) { + g->filenames = realloc(g->filenames, sizeof(char *) * (g->nfiles+SSI_FILE_BLOCK)); + if (g->filenames == NULL) return SSI_ERR_MALLOC; + g->fileformat= realloc(g->fileformat, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK)); + if (g->fileformat == NULL) return SSI_ERR_MALLOC; + g->bpl = realloc(g->bpl, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK)); + if (g->bpl == NULL) return SSI_ERR_MALLOC; + g->rpl = realloc(g->rpl, sizeof(sqd_uint32) * (g->nfiles+SSI_FILE_BLOCK)); + if (g->rpl == NULL) return SSI_ERR_MALLOC; + } + return 0; +} + + +/* Function: SSISetFileForSubseq() + * Date: SRE, Tue Jan 9 10:02:05 2001 [St. Louis] + * + * Purpose: Set SSI_FAST_SUBSEQ for the file indicated by + * filehandle {fh} in the index {g}, setting + * parameters {bpl} and {rpl} to the values given. + * {bpl} is the number of bytes per sequence data line. + * {rpl} is the number of residues per sequence data line. + * Caller must be sure that {bpl} and {rpl} do not change + * on any line of any sequence record in the file + * (except for the last data line of each record). If + * this is not the case in this file, SSI_FAST_SUBSEQ + * will not work, and this routine should not be + * called. + * + * Args: g - the active index + * fh - handle for file to set SSI_FAST_SUBSEQ on + * bpl - bytes per data line + * rpl - residues per data line + * + * Returns: 0 on success; 1 on error. + */ +int +SSISetFileForSubseq(SSIINDEX *g, int fh, int bpl, int rpl) +{ + if (fh < 0 || fh >= g->nfiles) return SSI_ERR_BADARG; + if (bpl <= 0 || rpl <= 0) return SSI_ERR_BADARG; + g->bpl[fh] = bpl; + g->rpl[fh] = rpl; + return 0; +} + + +/* Function: SSIAddPrimaryKeyToIndex() + * Date: SRE, Tue Jan 2 11:50:54 2001 [St. Louis] + * + * Purpose: Put primary key {key} in the index {g}, while telling + * the index this primary key is in the file associated + * with filehandle {fh} (returned by a previous call + * to SSIAddFileToIndex()), and its record starts at + * position {r_off} in the file. + * + * {d_off} and {L} are optional; they may be left unset + * by passing NULL and 0, respectively. (If one is + * provided, both must be provided.) If they are provided, + * {d_off} gives the position of the first line of sequence + * data in the record, and {L} gives the length of + * the sequence in residues. They are used when + * SSI_FAST_SUBSEQ is set for this file. If SSI_FAST_SUBSEQ + * is not set for the file, {d_off} and {L} will be + * ignored by the index reading API even if they are stored + * by the index writing API, so it doesn't hurt for the + * indexing program to provide them; typically they + * won't know whether it's safe to set SSI_FAST_SUBSEQ + * for the whole file until the whole file has been + * read and every key has already been added to the index. + * + * Args: g - active index + * key - primary key to add + * fh - handle on file that this key's in + * r_off - offset to start of record + * d_off - offset to start of sequence data + * L - length of sequence, or 0 + * + * Returns: 0 on success, nonzero on error. + */ +int +SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int fh, + SSIOFFSET *r_off, SSIOFFSET *d_off, int L) +{ + int n; /* a string length */ + + if (fh >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES; + if (g->nprimary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS; + if (L > 0 && d_off == NULL) abort(); /* need both. */ + + /* Before adding the key: check how big our index is. + * If it's getting too large, switch to external mode. + */ + if (!g->external && current_index_size(g) >= g->max_ram) + if (activate_external_sort(g) != 0) return SSI_ERR_NOFILE; + + /* Update maximum pkey length, if needed. + */ + n = strlen(key); + if ((n+1) > g->plen) g->plen = n+1; + + /* External mode? Simply append to disk... + */ + if (g->external) { + if (g->smode == SSI_OFFSET_I32) { + fprintf(g->ptmp, "%s\t%d\t%lu\t%lu\t%lu\n", + key, fh, (unsigned long) r_off->off.i32, + (unsigned long) (d_off == NULL? 0 : d_off->off.i32), + (unsigned long) L); + } else { +#ifdef CLUSTALO + fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", + key, fh, (unsigned long long)r_off->off.i64, + d_off == NULL? 0 : (unsigned long long) d_off->off.i64, + (unsigned long) L); +#else + fprintf(g->ptmp, "%s\t%d\t%llu\t%llu\t%lu\n", + key, fh, r_off->off.i64, + d_off == NULL? 0 : d_off->off.i64, + (unsigned long) L); +#endif + } + g->nprimary++; + return 0; + } + + /* Else: internal mode, keep keys in memory... + */ + if ((g->pkeys[g->nprimary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC; + g->pkeys[g->nprimary].fnum = (sqd_uint16) fh; + g->pkeys[g->nprimary].r_off = *r_off; + if (d_off != NULL && L > 0) { + g->pkeys[g->nprimary].d_off = *d_off; + g->pkeys[g->nprimary].len = L; + } else { + /* yeah, this looks stupid, but look: we have to give a valid + looking, non-NULL d_off of some sort, or writes will fail. + It's going to be unused anyway. */ + g->pkeys[g->nprimary].d_off = *r_off; + g->pkeys[g->nprimary].len = 0; + } + g->nprimary++; + + if (g->nprimary % SSI_KEY_BLOCK == 0) { + g->pkeys = realloc(g->pkeys, sizeof(struct ssipkey_s) * (g->nprimary+SSI_KEY_BLOCK)); + if (g->pkeys == NULL) return SSI_ERR_MALLOC; + } + return 0; +} + + +/* Function: SSIAddSecondaryKeyToIndex() + * Date: SRE, Tue Jan 2 12:44:40 2001 [St. Louis] + * + * Purpose: Puts secondary key {key} in the index {g}, associating + * it with primary key {pkey} that was previously + * registered by SSIAddPrimaryKeyToIndex(). + * + * Args: g - active index + * key - secondary key to add + * pkey - primary key to associate this key with + * + * Returns: 0 on success, nonzero on failure. + */ +int +SSIAddSecondaryKeyToIndex(SSIINDEX *g, char *key, char *pkey) +{ + int n; /* a string length */ + + if (g->nsecondary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS; + + /* Before adding the key: check how big our index is. + * If it's getting too large, switch to external mode. + */ + if (!g->external && current_index_size(g) >= g->max_ram) + if (activate_external_sort(g) != 0) return SSI_ERR_NOFILE; + + /* Update maximum secondary key length, if necessary. + */ + n = strlen(key); + if ((n+1) > g->slen) g->slen = n+1; + + /* if external mode: write info to disk. + */ + if (g->external) { + fprintf(g->stmp, "%s\t%s\n", key, pkey); + g->nsecondary++; + return 0; + } + + /* else, internal mode... store info in memory. + */ + if ((g->skeys[g->nsecondary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC; + if ((g->skeys[g->nsecondary].pkey = sre_strdup(pkey, -1)) == NULL) return SSI_ERR_MALLOC; + g->nsecondary++; + + if (g->nsecondary % SSI_KEY_BLOCK == 0) { + g->skeys = realloc(g->skeys, sizeof(struct ssiskey_s) * (g->nsecondary+SSI_KEY_BLOCK)); + if (g->skeys == NULL) return SSI_ERR_MALLOC; + } + return 0; +} + + + + +/* Function: SSIWriteIndex() + * Date: SRE, Tue Jan 2 13:55:56 2001 [St. Louis] + * + * Purpose: Writes complete index {g} in SSI format to a + * binary file {file}. Does all + * the overhead of sorting the primary and secondary keys, + * and maintaining the association of secondary keys + * with primary keys during and after the sort. + * + * Args: file - file to write to + * g - index to sort & write out. + * + * Returns: 0 on success, nonzero on error. + */ +/* needed for qsort() */ +static int +pkeysort(const void *k1, const void *k2) +{ + struct ssipkey_s *key1; + struct ssipkey_s *key2; + key1 = (struct ssipkey_s *) k1; + key2 = (struct ssipkey_s *) k2; + return strcmp(key1->key, key2->key); +} +static int +skeysort(const void *k1, const void *k2) +{ + struct ssiskey_s *key1; + struct ssiskey_s *key2; + key1 = (struct ssiskey_s *) k1; + key2 = (struct ssiskey_s *) k2; + return strcmp(key1->key, key2->key); +} +int +SSIWriteIndex(char *file, SSIINDEX *g) +{ + FILE *fp; + int status; + int i; + sqd_uint32 header_flags, file_flags; + sqd_uint32 frecsize, precsize, srecsize; + sqd_uint64 foffset, poffset, soffset; + char *s, *s2; + + if ((fp = fopen(file,"wb")) == NULL) return SSI_ERR_NOFILE; + status = 0; + + /* How big is the index? If it's going to be > 2GB, we need + * to flip to 64-bit index mode. 2047 (instead of 2048) gives us + * some slop room. + * die'ing here is pretty brutal - if we flip to 64-bit index + * mode, we hve 100's of millions of keys, so we've processed + * a long time before reaching this point. Ah well. + */ + if (current_index_size(g) >= 2047) { + g->imode = SSI_OFFSET_I64; +#ifndef HAS_64BIT_FILE_OFFSETS + Die("\ +Can't switch to 64-bit SSI index mode on this system, sorry;\n\ +I don't have 64-bit file offset functions available.\n"); +#endif + } + + /* Magic-looking numbers come from adding up sizes + * of things in bytes + */ + frecsize = 16 + g->flen; + precsize = (g->smode == SSI_OFFSET_I64) ? 22+g->plen : 14+g->plen; + srecsize = g->slen + g->plen; + + header_flags = 0; + if (g->smode == SSI_OFFSET_I64) header_flags |= SSI_USE64; + if (g->imode == SSI_OFFSET_I64) header_flags |= SSI_USE64_INDEX; + + /* Magic-looking numbers again come from adding up sizes + * of things in bytes + */ + foffset = (header_flags & SSI_USE64_INDEX) ? 66 : 54; + poffset = foffset + frecsize*g->nfiles; + soffset = poffset + precsize*g->nprimary; + + /* Sort the keys + * If external mode, make system calls to UNIX/POSIX "sort" in place, then + * open new sorted files for reading thru ptmp and stmp handles. + * If internal mode, call qsort. + * + * Note that you'd better force a POSIX locale for the sort; else, + * some silly distro (e.g. Mandrake Linux >=8.1) may have specified + * LC_COLLATE=en_US, and this'll give a sort "bug" in which it doesn't + * sort by byte order. + */ + if (g->external) { + char cmd[1024]; + + fclose(g->ptmp); + g->ptmp = NULL; + sprintf(cmd, "env LC_ALL=POSIX sort -o %s %s\n", g->ptmpfile, g->ptmpfile); + if ((status = system(cmd)) != 0) return SSI_ERR_EXTERNAL_SORT; + if ((g->ptmp = fopen(g->ptmpfile, "r")) == NULL) return SSI_ERR_EXTERNAL_SORT; + + fclose(g->stmp); + g->stmp = NULL; + sprintf(cmd, "env LC_ALL=POSIX sort -o %s %s\n", g->stmpfile, g->stmpfile); + if ((status = system(cmd)) != 0) return SSI_ERR_EXTERNAL_SORT; + if ((g->stmp = fopen(g->stmpfile, "r")) == NULL) return SSI_ERR_EXTERNAL_SORT; + } else { + qsort((void *) g->pkeys, g->nprimary, sizeof(struct ssipkey_s), pkeysort); + qsort((void *) g->skeys, g->nsecondary, sizeof(struct ssiskey_s), skeysort); + } + + /* Write the header + */ + if (! write_i32(fp, v20magic)) return SSI_ERR_FWRITE; + if (! write_i32(fp, header_flags)) return SSI_ERR_FWRITE; + if (! write_i16(fp, g->nfiles)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->nprimary)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->nsecondary)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->flen)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->plen)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->slen)) return SSI_ERR_FWRITE; + if (! write_i32(fp, frecsize)) return SSI_ERR_FWRITE; + if (! write_i32(fp, precsize)) return SSI_ERR_FWRITE; + if (! write_i32(fp, srecsize)) return SSI_ERR_FWRITE; + if (g->imode == SSI_OFFSET_I32) { + if (! write_i32(fp, foffset)) return SSI_ERR_FWRITE; + if (! write_i32(fp, poffset)) return SSI_ERR_FWRITE; + if (! write_i32(fp, soffset)) return SSI_ERR_FWRITE; + } else { + if (! write_i64(fp, foffset)) return SSI_ERR_FWRITE; + if (! write_i64(fp, poffset)) return SSI_ERR_FWRITE; + if (! write_i64(fp, soffset)) return SSI_ERR_FWRITE; + } + + /* The file section + */ + if ((s = malloc(sizeof(char) * g->flen)) == NULL) return SSI_ERR_MALLOC; + for (i = 0; i < g->nfiles; i++) + { + file_flags = 0; + if (g->bpl[i] > 0 && g->rpl[i] > 0) file_flags |= SSI_FAST_SUBSEQ; + + strcpy(s, g->filenames[i]); + if (fwrite(s, sizeof(char), g->flen, fp) != g->flen) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->fileformat[i])) return SSI_ERR_FWRITE; + if (! write_i32(fp, file_flags)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->bpl[i])) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->rpl[i])) return SSI_ERR_FWRITE; + } + free(s); + + /* The primary key section + */ + if ((s = malloc(sizeof(char) * g->plen)) == NULL) return SSI_ERR_MALLOC; + if (g->external) { + char *buf = NULL; + int buflen = 0; + struct ssipkey_s pkey; + for (i = 0; i < g->nprimary; i++) + { + if (sre_fgets(&buf, &buflen, g->ptmp) == NULL) return SSI_ERR_NODATA; + if (parse_pkey_info(buf, g->smode, &pkey) != 0) return SSI_ERR_BADFORMAT; + strcpy(s, pkey.key); + if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE; + if (! write_i16( fp, pkey.fnum)) return SSI_ERR_FWRITE; + if (! write_offset(fp, &(pkey.r_off))) return SSI_ERR_FWRITE; + if (! write_offset(fp, &(pkey.d_off))) return SSI_ERR_FWRITE; + if (! write_i32( fp, pkey.len)) return SSI_ERR_FWRITE; + } + free(buf); + } else { + for (i = 0; i < g->nprimary; i++) + { + strcpy(s, g->pkeys[i].key); + if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE; + if (! write_i16( fp, g->pkeys[i].fnum)) return SSI_ERR_FWRITE; + if (! write_offset(fp, &(g->pkeys[i].r_off))) return SSI_ERR_FWRITE; + if (! write_offset(fp, &(g->pkeys[i].d_off))) return SSI_ERR_FWRITE; + if (! write_i32( fp, g->pkeys[i].len)) return SSI_ERR_FWRITE; + } + } + + /* The secondary key section + */ + if (g->nsecondary > 0) { + if ((s2 = malloc(sizeof(char) * g->slen)) == NULL) return SSI_ERR_MALLOC; + + if (g->external) { + struct ssiskey_s skey; + char *buf = NULL; + int n = 0; + + for (i = 0; i < g->nsecondary; i++) + { + if (sre_fgets(&buf, &n, g->stmp) == NULL) return SSI_ERR_NODATA; + if (parse_skey_info(buf, &skey) != 0) return SSI_ERR_BADFORMAT; + strcpy(s2, skey.key); + strcpy(s, skey.pkey); + if (fwrite(s2, sizeof(char), g->slen, fp) != g->slen) return SSI_ERR_FWRITE; + if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE; + } + free(buf); + } else { + for (i = 0; i < g->nsecondary; i++) + { + strcpy(s2, g->skeys[i].key); + strcpy(s, g->skeys[i].pkey); + if (fwrite(s2, sizeof(char), g->slen, fp) != g->slen) return SSI_ERR_FWRITE; + if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE; + } + } + free(s2); + } + + free(s); + fclose(fp); + return status; +} + + +/* Function: SSIFreeIndex() + * Date: SRE, Tue Jan 2 11:44:08 2001 [St. Louis] + * + * Purpose: Free an index structure {g}. + * + * Args: g - ptr to an open index. + * + * Returns: (void) + */ +void +SSIFreeIndex(SSIINDEX *g) +{ + int i; + if (g != NULL) + { + if (g->external == FALSE) { + for (i = 0; i < g->nprimary; i++) free(g->pkeys[i].key); + for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].key); + for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].pkey); + if (g->pkeys != NULL) free(g->pkeys); + if (g->skeys != NULL) free(g->skeys); + } else { + if (g->ptmp != NULL) fclose(g->ptmp); + if (g->stmp != NULL) fclose(g->stmp); +#if DEBUGLEVEL == 0 + remove(g->ptmpfile); + remove(g->stmpfile); +#endif + } + for (i = 0; i < g->nfiles; i++) free(g->filenames[i]); + if (g->filenames != NULL) free(g->filenames); + if (g->fileformat != NULL) free(g->fileformat); + if (g->bpl != NULL) free(g->bpl); + if (g->rpl != NULL) free(g->rpl); + free(g); + } +} + + +/* Function: SSIErrorString() + * Date: SRE, Tue Jan 2 10:38:10 2001 [St. Louis] + * + * Purpose: Returns a ptr to an internal string corresponding + * to error {n}, a code returned from any of the + * functions in the API that return non-zero on error. + * + * Args: n - error code + * + * Returns: ptr to an internal string. + */ +char * +SSIErrorString(int n) +{ + switch (n) { + case SSI_ERR_OK: return "ok (no error)"; + case SSI_ERR_NODATA: return "no data, fread() failed"; + case SSI_ERR_NO_SUCH_KEY: return "no such key"; + case SSI_ERR_MALLOC: return "out of memory, malloc() failed"; + case SSI_ERR_NOFILE: return "file not found, fopen() failed"; + case SSI_ERR_BADMAGIC: return "not a SSI file? (bad magic)"; + case SSI_ERR_BADFORMAT: return "corrupt format? unexpected data"; + case SSI_ERR_NO64BIT: return "no large file support for this system"; + case SSI_ERR_SEEK_FAILED: return "failed to reposition on disk"; + case SSI_ERR_TELL_FAILED: return "failed to get file position on disk"; + case SSI_ERR_NO_SUBSEQS: return "no fast subseq support for this seqfile"; + case SSI_ERR_RANGE: return "subseq start is out of range"; + case SSI_ERR_BADARG: return "an argument is out of range"; + case SSI_ERR_TOOMANY_FILES: return "number of files exceeds limit"; + case SSI_ERR_TOOMANY_KEYS: return "number of keys exceeds limit"; + case SSI_ERR_FWRITE: return "an fwrite() failed"; + case SSI_ERR_EXTERNAL_SORT: return "some problem with external sorting"; + default: return "unrecognized code"; + } + /*NOTREACHED*/ +} + +static int +read_i16(FILE *fp, sqd_uint16 *ret_result) +{ + sqd_uint16 result; + if (fread(&result, sizeof(sqd_uint16), 1, fp) != 1) return 0; + *ret_result = sre_ntoh16(result); + return 1; +} +static int +write_i16(FILE *fp, sqd_uint16 n) +{ + n = sre_hton16(n); + if (fwrite(&n, sizeof(sqd_uint16), 1, fp) != 1) return 0; + return 1; +} +static int +read_i32(FILE *fp, sqd_uint32 *ret_result) +{ + sqd_uint32 result; + if (fread(&result, sizeof(sqd_uint32), 1, fp) != 1) return 0; + *ret_result = sre_ntoh32(result); + return 1; +} +static int +write_i32(FILE *fp, sqd_uint32 n) +{ + n = sre_hton32(n); + if (fwrite(&n, sizeof(sqd_uint32), 1, fp) != 1) return 0; + return 1; +} +static int +read_i64(FILE *fp, sqd_uint64 *ret_result) +{ + sqd_uint64 result; + if (fread(&result, sizeof(sqd_uint64), 1, fp) != 1) return 0; + *ret_result = sre_ntoh64(result); + return 1; +} +static int +write_i64(FILE *fp, sqd_uint64 n) +{ + n = sre_hton64(n); + if (fwrite(&n, sizeof(sqd_uint64), 1, fp) != 1) return 0; + return 1; +} +static int +read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset) +{ + if (mode == SSI_OFFSET_I32) { + ret_offset->mode = SSI_OFFSET_I32; + if (! read_i32(fp, &(ret_offset->off.i32))) return 0; + } else if (mode == SSI_OFFSET_I64) { + ret_offset->mode = SSI_OFFSET_I64; + if (! read_i64(fp, &(ret_offset->off.i64))) return 0; + } else return 0; + + return 1; +} +static int +write_offset(FILE *fp, SSIOFFSET *offset) +{ + if (offset->mode == SSI_OFFSET_I32) return write_i32(fp, offset->off.i32); + else if (offset->mode == SSI_OFFSET_I64) return write_i64(fp, offset->off.i64); + else abort(); + /*UNREACHED*/ + return 1; /* silence bitchy compilers */ +} + +static int +parse_pkey_info(char *buf, char mode, struct ssipkey_s *pkey) +{ + char *s, *tok; + int n; + + s = buf; + if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; + pkey->key = tok; + if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; + pkey->fnum = (sqd_uint16) atoi(tok); + + if (mode == SSI_OFFSET_I32) { + if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; + pkey->r_off.mode = mode; + pkey->r_off.off.i32 = (sqd_uint32) strtoul(tok, NULL, 10); + if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; + pkey->d_off.mode = mode; + pkey->d_off.off.i32 = (sqd_uint32) strtoul(tok, NULL, 10); + } +#ifdef HAS_64BIT_FILE_OFFSETS + else { + if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; + pkey->r_off.mode = mode; + pkey->r_off.off.i64 = (sqd_uint64) strtoull(tok, NULL, 10); + if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; + pkey->d_off.mode = mode; + pkey->d_off.off.i64 = (sqd_uint64) strtoull(tok, NULL, 10); + } +#else + else { + return SSI_ERR_NO64BIT; + } +#endif + if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; + pkey->len = (sqd_uint32) strtoul(tok, NULL, 10); + + return 0; +} +static int +parse_skey_info(char *buf, struct ssiskey_s *skey) +{ + char *s, *tok; + int n; + + s = buf; + if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; + skey->key = tok; + if ((tok = sre_strtok(&s, "\t\n", &n)) == NULL) return SSI_ERR_BADFORMAT; + skey->pkey = tok; + return 0; +} + +/* Function: binary_search() + * Date: SRE, Sun Dec 31 16:05:03 2000 [St. Louis] + * + * Purpose: Find a key in a SSI index, by a binary search + * in an alphabetically sorted list of keys. If successful, + * return 0, and the index file is positioned to read + * the rest of the data for that key. Else returns nonzero. + * + * Args: sfp - an open SSIFILE + * key - key to find + * klen - key length to allocate (plen or slen from sfp) + * base - base offset (poffset or soffset) + * recsize - size of each key record in bytes (precsize or srecsize) + * maxidx - # of keys (nprimary or nsecondary) + * + * Returns: 0 on success, and leaves file positioned for reading remaining + * data for the key. + * Nonzero on failure: + * SSI_ERR_NO_SUCH_KEY - that key's not in the index + * SSI_ERR_MALLOC - a memory allocation failure + * SSI_ERR_NODATA - an fread() failed + */ +static int +binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base, + sqd_uint32 recsize, sqd_uint32 maxidx) +{ + char *name; + sqd_uint32 left, right, mid; + int cmp; + int status; + + if (maxidx == 0) return SSI_ERR_NO_SUCH_KEY; /* special case: empty index */ + if ((name = malloc (sizeof(char)*klen)) == NULL) return SSI_ERR_MALLOC; + left = 0; + right = maxidx-1; + while (1) { /* A binary search: */ + mid = (left+right) / 2; /* careful here. only works because + we limit unsigned vars to signed ranges. */ + if ((status = indexfile_position(sfp, base, recsize, mid)) != 0) + { free(name); return status; } + if (fread(name, sizeof(char), klen, sfp->fp) != klen) + { free(name); return SSI_ERR_NODATA; } + cmp = strcmp(name, key); + if (cmp == 0) break; /* found it! */ + else if (left >= right) /* oops, missed it; fail */ + { free(name); return SSI_ERR_NO_SUCH_KEY; } + else if (cmp < 0) left = mid+1; /* it's right of mid */ + else if (cmp > 0) { + if (mid == 0) { free(name); return SSI_ERR_NO_SUCH_KEY; } /* special case, beware */ + else right = mid-1; /* it's left of mid */ + } + } + free(name); + return 0; /* and sfp->fp is positioned... */ +} + +/* Function: indexfile_position() + * Date: SRE, Mon Jan 1 19:32:49 2001 [St. Louis] + * + * Purpose: Position the open index file {sfp} at the start + * of record {n} in a list of records that starts at + * base offset {base}, where each record takes up {l} + * bytes. (e.g. the position is byte (base + n*l)). + * + * Args: sfp - open SSIFILE + * base - offset of record 0 (e.g. sfp->foffset) + * len - size of each record in bytes (e.g. sfp->frecsize) + * n - which record to get (e.g. 0..sfp->nfiles) + * + * Returns: 0 on success, non-zero on failure. + */ +static int +indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len, sqd_uint32 n) +{ + SSIOFFSET pos; + int status; + + if (base->mode == SSI_OFFSET_I32) { + pos.mode = SSI_OFFSET_I32; + pos.off.i32 = base->off.i32 + n*len; + } else if (base->mode == SSI_OFFSET_I64) { + pos.mode = SSI_OFFSET_I64; + pos.off.i64 = base->off.i64 + n*len; + } else return 0; + if ((status = SSISetFilePosition(sfp->fp, &pos)) != 0) return status; + return 0; +} + +/* Function: current_index_size() + * Date: SRE, Tue Feb 20 18:23:30 2001 [St. Louis] + * + * Purpose: Calculates the size of the current index, + * in megabytes. + */ +static sqd_uint64 +current_index_size(SSIINDEX *g) +{ + sqd_uint64 frecsize, precsize, srecsize; + sqd_uint64 total; + + /* Magic-looking numbers come from adding up sizes + * of things in bytes + */ + frecsize = 16 + g->flen; + precsize = (g->smode == SSI_OFFSET_I64) ? 22+g->plen : 14+g->plen; + srecsize = g->plen+g->slen; + total = (66L + /* header size, if 64bit index offsets */ + frecsize * g->nfiles + /* file section size */ + precsize * g->nprimary + /* primary key section size */ + srecsize * g->nsecondary) / /* secondary key section size */ + 1048576L; + return total; +} +/* Function: activate_external_sort() + * Date: SRE, Mon Feb 4 09:08:08 2002 [St. Louis] + * + * Purpose: Switch to external sort mode. + * Open file handles for external index files (ptmp, stmp). + * Flush current index information to these files. + * Free current memory, turn over control to the tmpfiles. + * + * Return: 0 on success; non-zero on failure. + */ +static int +activate_external_sort(SSIINDEX *g) +{ + int i; + /* it's a bit late to be checking this, but... */ + if (g->external) return 0; /* we already are external, fool */ + if (FileExists(g->ptmpfile)) return 1; + if (FileExists(g->stmpfile)) return 1; + if ((g->ptmp = fopen(g->ptmpfile, "w")) == NULL) return 1; + if ((g->stmp = fopen(g->stmpfile, "w")) == NULL) return 1; + + /* Flush the current indices. + */ + SQD_DPRINTF1(("Switching to external sort - flushing ssiindex to disk...\n")); + for (i = 0; i < g->nprimary; i++) { + if (g->smode == SSI_OFFSET_I32) { + fprintf(g->ptmp, "%s\t%u\t%lu\t%lu\t%lu\n", + g->pkeys[i].key, g->pkeys[i].fnum, + (unsigned long) g->pkeys[i].r_off.off.i32, + (unsigned long) g->pkeys[i].d_off.off.i32, + (unsigned long) g->pkeys[i].len); + } else { + fprintf(g->ptmp, "%s\t%u\t%llu\t%llu\t%lu\n", + g->pkeys[i].key, g->pkeys[i].fnum, + (unsigned long long) g->pkeys[i].r_off.off.i64, + (unsigned long long) g->pkeys[i].d_off.off.i64, + (unsigned long) g->pkeys[i].len); + } + } + for (i = 0; i < g->nsecondary; i++) + fprintf(g->stmp, "%s\t%s\n", g->skeys[i].key, g->skeys[i].pkey); + + /* Free the memory now that we've flushed our lists to disk + */ + for (i = 0; i < g->nprimary; i++) free(g->pkeys[i].key); + for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].key); + for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].pkey); + if (g->pkeys != NULL) free(g->pkeys); + if (g->skeys != NULL) free(g->skeys); + g->pkeys = NULL; + g->skeys = NULL; + + /* Turn control over to external accumulation mode. + */ + g->external = TRUE; + return 0; +} + + +/***************************************************************** + * Debugging API + *****************************************************************/ +void +SSIForceExternalSort(SSIINDEX *g) +{ + if (activate_external_sort(g) != 0) + Die("failed to turn external sorting on."); +} + + +/***************************************************************** + * Test driving mode + *****************************************************************/ +#ifdef MUGGINS_LETS_ME_SLEEP +/* Minimally: + cc -g -Wall -o shiva -DDEBUGLEVEL=1 -DMUGGINS_LETS_ME_SLEEP ssi.c sqerror.c sre_string.c types.c sre_ctype.c sre_math.c file.c -lm +*/ + +int +main(int argc, char **argv) +{ + char name[32], accession[32]; + SSIINDEX *ssi; + int mode; + SSIOFFSET r_off, d_off; + FILE *ofp; + int i; + int fh; /* a file handle */ + int status; /* return status from a SSI call */ + + mode = SSI_OFFSET_I32; + if ((ssi = SSICreateIndex(mode)) == NULL) + Die("Failed to allocate SSI index"); + + /* Generate two FASTA files, tmp.0 and tmp.1, and index them. + */ + if ((ofp = fopen("tmp.0", "w")) == NULL) + Die("failed to open tmp.0"); + if ((status = SSIAddFileToIndex(ssi, "tmp.0", SQFILE_FASTA, &fh)) != 0) + Die("SSIAddFileToIndex() failed: %s", SSIErrorString(status)); + for (i = 0; i < 10; i++) { + if ((status = SSIGetFilePosition(ofp, mode, &r_off)) != 0) + Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); + sprintf(name, "seq%d", i); + sprintf(accession, "ac%d", i); + fprintf(ofp, ">%s [%s] Description? we don't need no steenking description.\n", + name, accession); + if ((status = SSIGetFilePosition(ofp, mode, &d_off)) != 0) + Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); + fprintf(ofp, "AAAAAAAAAA\n"); + fprintf(ofp, "CCCCCCCCCC\n"); + fprintf(ofp, "GGGGGGGGGG\n"); + fprintf(ofp, "TTTTTTTTTT\n"); + + if ((status = SSIAddPrimaryKeyToIndex(ssi, name, fh, &r_off, &d_off, 40)) != 0) + Die("SSIAddPrimaryKeyToIndex() failed: %s", SSIErrorString(status)); + if ((status = SSIAddSecondaryKeyToIndex(ssi, accession, name)) != 0) + Die("SSIAddSecondaryKeyToIndex() failed: %s", SSIErrorString(status)); + } + SSISetFileForSubseq(ssi, fh, 11, 10); + fclose(ofp); + + if ((ofp = fopen("tmp.1", "w")) == NULL) + Die("failed to open tmp.1"); + if ((status = SSIAddFileToIndex(ssi, "tmp.1", SQFILE_FASTA, &fh)) != 0) + Die("SSIAddFileToIndex() failed: %s", SSIErrorString(status)); + for (i = 10; i < 20; i++) { + if ((status = SSIGetFilePosition(ofp, mode, &r_off)) != 0) + Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); + sprintf(name, "seq%d", i); + sprintf(accession, "ac%d", i); + fprintf(ofp, ">%s [%s] i/o, i/o, it's off to disk we go.\n", + name, accession); + if ((status = SSIGetFilePosition(ofp, mode, &d_off)) != 0) + Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); + fprintf(ofp, "AAAAAAAAAA 10\n"); + fprintf(ofp, "CCCCCCCCCC 20\n"); + fprintf(ofp, "GGGGGGGGGG 30\n"); + fprintf(ofp, "TTTTTTTTTT 40\n"); + + if ((status = SSIAddPrimaryKeyToIndex(ssi, name, fh, &r_off, &d_off, 40)) != 0) + Die("SSIAddPrimaryKeyToIndex() failed: %s", SSIErrorString(status)); + if ((status = SSIAddSecondaryKeyToIndex(ssi, accession, name)) != 0) + Die("SSIAddSecondaryKeyToIndex() failed: %s", SSIErrorString(status)); + } + SSISetFileForSubseq(ssi, fh, 14, 10); + fclose(ofp); + + /* Write the index to tmp.ssi + */ + if ((status = SSIWriteIndex("tmp.ssi", ssi)) != 0) + Die("SSIWriteIndex() failed: %s", SSIErrorString(status)); + SSIFreeIndex(ssi); + + /* Now reopen the index and run some tests. + */ + exit(0); +} + + +#endif /* test driving code */ + + +