diff srf2fastq/io_lib-1.12.2/progs/hash_sff.c @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/srf2fastq/io_lib-1.12.2/progs/hash_sff.c	Tue Jun 07 17:48:05 2011 -0400
@@ -0,0 +1,262 @@
+/*
+ * This adds a hash table index (".hsh" v1.00 format) to an SFF archive.
+ * It does this either inline on the file itself (provided it doesn't already
+ * have an index) or by producing a new indexed SFF archive.
+ *
+ * It has been coded to require only the memory needed to store the index
+ * and so does quite a lot of I/O but with minimised memory. For a 460,000
+ * SFF archive it took about 22 seconds real time on a 1.7GHz P4 when copying
+ * or 10 seconds when updating inline.
+ */
+
+/* ---------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <io_lib/hash_table.h>
+#include <io_lib/sff.h>
+#include <io_lib/os.h>
+#include <io_lib/mFILE.h>
+
+/*
+ * Override the sff.c functions to use FILE pointers instead. This means
+ * we don't have to load the entire archive into memory, which is optimal when
+ * dealing with a single file (ie in sff/sff.c), but not when indexing it.
+ *
+ * Done with minimal error checking I'll admit...
+ */
+static sff_read_header *fread_sff_read_header(FILE *fp) {
+    sff_read_header *h;
+    unsigned char rhdr[16];
+
+    if (16 != fread(rhdr, 1, 16, fp))
+	return NULL;
+    h = decode_sff_read_header(rhdr);
+
+    if (h->name_len != fread(h->name, 1, h->name_len, fp))
+	return free_sff_read_header(h), NULL;
+    
+    /* Pad to 8 chars */
+    fseek(fp, (ftell(fp) + 7)& ~7, SEEK_SET);
+
+    return h;
+}
+
+static sff_common_header *fread_sff_common_header(FILE *fp) {
+    sff_common_header *h;
+    unsigned char chdr[31];
+
+    if (31 != fread(chdr, 1, 31, fp))
+	return NULL;
+    h = decode_sff_common_header(chdr);
+    if (h->flow_len != fread(h->flow, 1, h->flow_len, fp))
+	return free_sff_common_header(h), NULL;
+    if (h->key_len != fread(h->key , 1, h->key_len,  fp))
+	return free_sff_common_header(h), NULL;
+
+    /* Pad to 8 chars */
+    fseek(fp, (ftell(fp) + 7)& ~7, SEEK_SET);
+
+    return h;
+}
+
+void usage(void) {
+    fprintf(stderr, "Usage: hash_sff [-o outfile] [-t] sff_file ...\n");
+    exit(1);
+}
+
+int main(int argc, char **argv) {
+    HashFile *hf;
+    sff_common_header *ch;
+    sff_read_header *rh;
+    int i, dot, arg;
+    char *sff;
+    char hdr[31];
+    uint64_t index_offset = 0;
+    uint32_t index_size, index_skipped;
+    FILE *fp, *fpout = NULL;
+    int copy_archive = 1;
+    
+
+    /* process command line arguments of the form -arg */
+    for (argc--, argv++; argc > 0; argc--, argv++) {
+	if (**argv != '-' || strcmp(*argv, "--") == 0)
+	    break;
+
+	if (strcmp(*argv, "-o") == 0 && argc > 1) {
+	    if (NULL == (fpout = fopen(argv[1], "wb+"))) {
+		perror(argv[1]);
+		return 1;
+	    }
+	    argv++;
+	    argc--;
+
+	} else if (strcmp(*argv, "-t") == 0) {
+	    copy_archive = 0;
+
+	} else if (**argv == '-') {
+	    usage();
+	}
+
+    }
+
+    if (argc < 1)
+	usage();
+
+    if (copy_archive == 0 && argc != 1) {
+	fprintf(stderr, "-t option only supported with a single sff argument\n");
+	return 1;
+    }
+
+    /* Create the hash table */
+    hf = HashFileCreate(0, HASH_DYNAMIC_SIZE);
+    hf->nheaders = 0;
+    hf->headers = NULL;
+
+    for (arg = 0; arg < argc; arg++) {
+	/* open (and read) the entire sff file */
+	sff = argv[arg];
+
+	printf("Indexing %s:\n", sff);
+	if (fpout) {
+	    if (NULL == (fp = fopen(sff, "rb"))) {
+		perror(sff);
+		return 1;
+	    }
+	} else { 
+	    if (NULL == (fp = fopen(sff, "rb+"))) {
+		perror(sff);
+		return 1;
+	    }
+	}
+
+	/* Read the common header */
+	ch = fread_sff_common_header(fp);
+
+	if (ch->index_len && !fpout) {
+	    fprintf(stderr, "Archive already contains index.\nReplacing the"
+		    " index requires the \"-o outfile\" option.\n");
+	    return 1;
+	}
+
+	/* Add the SFF common header as a hash file-header */
+	hf->nheaders++;
+	hf->headers = (HashFileSection *)realloc(hf->headers, hf->nheaders *
+						 sizeof(*hf->headers));
+	hf->headers[hf->nheaders-1].pos = 0;
+	hf->headers[hf->nheaders-1].size = ch->header_len;
+	hf->headers[hf->nheaders-1].cached_data = NULL;
+
+	/* Read the index items, adding to the hash */
+	index_skipped = 0;
+	dot = 0;
+	printf("                                                                       |\r|");
+	for (i = 0; i < ch->nreads; i++) {
+	    int dlen;
+	    uint32_t offset;
+	    HashData hd;
+	    HashFileItem *hfi;
+	    
+	    if (i >= dot * (ch->nreads/69)) {
+		putchar('.');
+		fflush(stdout);
+		dot++;
+	    }
+
+	    /* Skip old index if present */
+	    offset = ftell(fp);
+	    if (offset == ch->index_offset) {
+		fseek(fp, ch->index_len, SEEK_CUR);
+		index_skipped = ch->index_len;
+		continue;
+	    }
+
+	    hfi = (HashFileItem *)calloc(1, sizeof(*hfi));
+	    rh = fread_sff_read_header(fp);
+	    dlen = (2*ch->flow_len + 3*rh->nbases + 7) & ~7;
+	    fseek(fp, dlen, SEEK_CUR);
+	
+	    hfi->header = hf->nheaders;
+	    hfi->footer = 0;
+	    hfi->pos = offset - index_skipped;
+	    hfi->size = (ftell(fp) - index_skipped) - hfi->pos;
+	    hd.p = hfi;
+
+	    HashTableAdd(hf->h, rh->name, rh->name_len, hd, NULL);
+	}
+	printf("\n");
+	HashTableStats(hf->h, stdout);
+
+	index_offset = ftell(fp) - index_skipped;
+
+	/* Copy the archive if needed, minus the old index */
+	if (fpout && copy_archive) {
+	    char block[8192];
+	    size_t len;
+	    uint64_t pos = 0;
+
+	    printf("\nCopying archive\n");
+
+	    fseek(fp, 0, SEEK_SET);
+	    while (len = fread(block, 1, 8192, fp)) {
+		/* Skip previous index */
+		if (pos < ch->index_offset && pos+len > ch->index_offset) {
+		    len = ch->index_offset - pos;
+		    fseek(fp, ch->index_offset + ch->index_len, SEEK_SET);
+		}
+		if (len && len != fwrite(block, 1, len, fpout)) {
+		    fprintf(stderr, "Failed to output new archive\n");
+		    return 1;
+		}
+		pos += len;
+	    }
+	}
+	
+	if (!fpout) {
+	    /* Save the hash */
+	    printf("Saving index\n");
+	    fseek(fp, 0, SEEK_END);
+	    index_size = HashFileSave(hf, fp, 0);
+	    HashFileDestroy(hf);
+
+	    /* Update the common header */
+	    fseek(fp, 0, SEEK_SET);
+	    fread(hdr, 1, 31, fp);
+	    *(uint64_t *)(hdr+8)  = be_int8(index_offset);
+	    *(uint32_t *)(hdr+16) = be_int4(index_size);
+	    fseek(fp, 0, SEEK_SET);
+	    fwrite(hdr, 1, 31, fp);
+	}
+
+	fclose(fp);
+    }
+
+    if (fpout) {
+	/* Save the hash */
+	printf("Saving index\n");
+
+	if (!copy_archive) {
+	    hf->archive = strdup(argv[0]);
+	    index_offset = 0;
+	}
+
+	fseek(fpout, 0, SEEK_END);
+	index_size = HashFileSave(hf, fpout, 0);
+	HashFileDestroy(hf);
+
+	/* Update the common header to indicate index location */
+	if (copy_archive) {
+	    fseek(fpout, 0, SEEK_SET);
+	    fread(hdr, 1, 31, fpout);
+	    *(uint64_t *)(hdr+8)  = be_int8(index_offset);
+	    *(uint32_t *)(hdr+16) = be_int4(index_size);
+	    fseek(fpout, 0, SEEK_SET);
+	    fwrite(hdr, 1, 31, fpout);
+	}
+	fclose(fpout);
+    }
+    
+    return 0;
+}