Mercurial > repos > dawe > srf2fastq
diff srf2fastq/io_lib-1.12.2/progs/hash_sff.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srf2fastq/io_lib-1.12.2/progs/hash_sff.c Tue Jun 07 17:48:05 2011 -0400 @@ -0,0 +1,262 @@ +/* + * This adds a hash table index (".hsh" v1.00 format) to an SFF archive. + * It does this either inline on the file itself (provided it doesn't already + * have an index) or by producing a new indexed SFF archive. + * + * It has been coded to require only the memory needed to store the index + * and so does quite a lot of I/O but with minimised memory. For a 460,000 + * SFF archive it took about 22 seconds real time on a 1.7GHz P4 when copying + * or 10 seconds when updating inline. + */ + +/* ---------------------------------------------------------------------- */ + +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> +#include <io_lib/hash_table.h> +#include <io_lib/sff.h> +#include <io_lib/os.h> +#include <io_lib/mFILE.h> + +/* + * Override the sff.c functions to use FILE pointers instead. This means + * we don't have to load the entire archive into memory, which is optimal when + * dealing with a single file (ie in sff/sff.c), but not when indexing it. + * + * Done with minimal error checking I'll admit... + */ +static sff_read_header *fread_sff_read_header(FILE *fp) { + sff_read_header *h; + unsigned char rhdr[16]; + + if (16 != fread(rhdr, 1, 16, fp)) + return NULL; + h = decode_sff_read_header(rhdr); + + if (h->name_len != fread(h->name, 1, h->name_len, fp)) + return free_sff_read_header(h), NULL; + + /* Pad to 8 chars */ + fseek(fp, (ftell(fp) + 7)& ~7, SEEK_SET); + + return h; +} + +static sff_common_header *fread_sff_common_header(FILE *fp) { + sff_common_header *h; + unsigned char chdr[31]; + + if (31 != fread(chdr, 1, 31, fp)) + return NULL; + h = decode_sff_common_header(chdr); + if (h->flow_len != fread(h->flow, 1, h->flow_len, fp)) + return free_sff_common_header(h), NULL; + if (h->key_len != fread(h->key , 1, h->key_len, fp)) + return free_sff_common_header(h), NULL; + + /* Pad to 8 chars */ + fseek(fp, (ftell(fp) + 7)& ~7, SEEK_SET); + + return h; +} + +void usage(void) { + fprintf(stderr, "Usage: hash_sff [-o outfile] [-t] sff_file ...\n"); + exit(1); +} + +int main(int argc, char **argv) { + HashFile *hf; + sff_common_header *ch; + sff_read_header *rh; + int i, dot, arg; + char *sff; + char hdr[31]; + uint64_t index_offset = 0; + uint32_t index_size, index_skipped; + FILE *fp, *fpout = NULL; + int copy_archive = 1; + + + /* process command line arguments of the form -arg */ + for (argc--, argv++; argc > 0; argc--, argv++) { + if (**argv != '-' || strcmp(*argv, "--") == 0) + break; + + if (strcmp(*argv, "-o") == 0 && argc > 1) { + if (NULL == (fpout = fopen(argv[1], "wb+"))) { + perror(argv[1]); + return 1; + } + argv++; + argc--; + + } else if (strcmp(*argv, "-t") == 0) { + copy_archive = 0; + + } else if (**argv == '-') { + usage(); + } + + } + + if (argc < 1) + usage(); + + if (copy_archive == 0 && argc != 1) { + fprintf(stderr, "-t option only supported with a single sff argument\n"); + return 1; + } + + /* Create the hash table */ + hf = HashFileCreate(0, HASH_DYNAMIC_SIZE); + hf->nheaders = 0; + hf->headers = NULL; + + for (arg = 0; arg < argc; arg++) { + /* open (and read) the entire sff file */ + sff = argv[arg]; + + printf("Indexing %s:\n", sff); + if (fpout) { + if (NULL == (fp = fopen(sff, "rb"))) { + perror(sff); + return 1; + } + } else { + if (NULL == (fp = fopen(sff, "rb+"))) { + perror(sff); + return 1; + } + } + + /* Read the common header */ + ch = fread_sff_common_header(fp); + + if (ch->index_len && !fpout) { + fprintf(stderr, "Archive already contains index.\nReplacing the" + " index requires the \"-o outfile\" option.\n"); + return 1; + } + + /* Add the SFF common header as a hash file-header */ + hf->nheaders++; + hf->headers = (HashFileSection *)realloc(hf->headers, hf->nheaders * + sizeof(*hf->headers)); + hf->headers[hf->nheaders-1].pos = 0; + hf->headers[hf->nheaders-1].size = ch->header_len; + hf->headers[hf->nheaders-1].cached_data = NULL; + + /* Read the index items, adding to the hash */ + index_skipped = 0; + dot = 0; + printf(" |\r|"); + for (i = 0; i < ch->nreads; i++) { + int dlen; + uint32_t offset; + HashData hd; + HashFileItem *hfi; + + if (i >= dot * (ch->nreads/69)) { + putchar('.'); + fflush(stdout); + dot++; + } + + /* Skip old index if present */ + offset = ftell(fp); + if (offset == ch->index_offset) { + fseek(fp, ch->index_len, SEEK_CUR); + index_skipped = ch->index_len; + continue; + } + + hfi = (HashFileItem *)calloc(1, sizeof(*hfi)); + rh = fread_sff_read_header(fp); + dlen = (2*ch->flow_len + 3*rh->nbases + 7) & ~7; + fseek(fp, dlen, SEEK_CUR); + + hfi->header = hf->nheaders; + hfi->footer = 0; + hfi->pos = offset - index_skipped; + hfi->size = (ftell(fp) - index_skipped) - hfi->pos; + hd.p = hfi; + + HashTableAdd(hf->h, rh->name, rh->name_len, hd, NULL); + } + printf("\n"); + HashTableStats(hf->h, stdout); + + index_offset = ftell(fp) - index_skipped; + + /* Copy the archive if needed, minus the old index */ + if (fpout && copy_archive) { + char block[8192]; + size_t len; + uint64_t pos = 0; + + printf("\nCopying archive\n"); + + fseek(fp, 0, SEEK_SET); + while (len = fread(block, 1, 8192, fp)) { + /* Skip previous index */ + if (pos < ch->index_offset && pos+len > ch->index_offset) { + len = ch->index_offset - pos; + fseek(fp, ch->index_offset + ch->index_len, SEEK_SET); + } + if (len && len != fwrite(block, 1, len, fpout)) { + fprintf(stderr, "Failed to output new archive\n"); + return 1; + } + pos += len; + } + } + + if (!fpout) { + /* Save the hash */ + printf("Saving index\n"); + fseek(fp, 0, SEEK_END); + index_size = HashFileSave(hf, fp, 0); + HashFileDestroy(hf); + + /* Update the common header */ + fseek(fp, 0, SEEK_SET); + fread(hdr, 1, 31, fp); + *(uint64_t *)(hdr+8) = be_int8(index_offset); + *(uint32_t *)(hdr+16) = be_int4(index_size); + fseek(fp, 0, SEEK_SET); + fwrite(hdr, 1, 31, fp); + } + + fclose(fp); + } + + if (fpout) { + /* Save the hash */ + printf("Saving index\n"); + + if (!copy_archive) { + hf->archive = strdup(argv[0]); + index_offset = 0; + } + + fseek(fpout, 0, SEEK_END); + index_size = HashFileSave(hf, fpout, 0); + HashFileDestroy(hf); + + /* Update the common header to indicate index location */ + if (copy_archive) { + fseek(fpout, 0, SEEK_SET); + fread(hdr, 1, 31, fpout); + *(uint64_t *)(hdr+8) = be_int8(index_offset); + *(uint32_t *)(hdr+16) = be_int4(index_size); + fseek(fpout, 0, SEEK_SET); + fwrite(hdr, 1, 31, fpout); + } + fclose(fpout); + } + + return 0; +}