diff ezBAMQC/src/htslib/cram/cram_index.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ezBAMQC/src/htslib/cram/cram_index.c	Thu Mar 24 17:12:52 2016 -0400
@@ -0,0 +1,557 @@
+/*
+Copyright (c) 2013-2014 Genome Research Ltd.
+Author: James Bonfield <jkb@sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, 
+this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice, 
+this list of conditions and the following disclaimer in the documentation 
+and/or other materials provided with the distribution.
+
+   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * The index is a gzipped tab-delimited text file with one line per slice.
+ * The columns are:
+ * 1: reference number (0 to N-1, as per BAM ref_id)
+ * 2: reference position of 1st read in slice (1..?)
+ * 3: number of reads in slice
+ * 4: offset of container start (relative to end of SAM header, so 1st
+ *    container is offset 0).
+ * 5: slice number within container (ie which landmark).
+ *
+ * In memory, we hold this in a nested containment list. Each list element is
+ * a cram_index struct. Each element in turn can contain its own list of
+ * cram_index structs.
+ *
+ * Any start..end range which is entirely contained within another (and
+ * earlier as it is sorted) range will be held within it. This ensures that
+ * the outer list will never have containments and we can safely do a
+ * binary search to find the first range which overlaps any given coordinate.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "io_lib_config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "htslib/hfile.h"
+#include "cram/cram.h"
+#include "cram/os.h"
+#include "cram/zfio.h"
+
+#if 0
+static void dump_index_(cram_index *e, int level) {
+    int i, n;
+    n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
+    printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
+    for (i = 0; i < e->nslice; i++) {
+	dump_index_(&e->e[i], level+1);
+    }
+}
+
+static void dump_index(cram_fd *fd) {
+    int i;
+    for (i = 0; i < fd->index_sz; i++) {
+	dump_index_(&fd->index[i], 0);
+    }
+}
+#endif
+
+static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
+    int sign = 1;
+    int32_t val = 0;
+    size_t p = *pos;
+
+    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
+	   p++;
+
+    if (p < k->l && k->s[p] == '-')
+	sign = -1, p++;
+
+    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
+	return -1;
+
+    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
+	val = val*10 + k->s[p++]-'0';
+    
+    *pos = p;
+    *val_p = sign*val;
+
+    return 0;
+}
+
+static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
+    int sign = 1;
+    int64_t val = 0;
+    size_t p = *pos;
+
+    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
+	   p++;
+
+    if (p < k->l && k->s[p] == '-')
+	sign = -1, p++;
+
+    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
+	return -1;
+
+    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
+	val = val*10 + k->s[p++]-'0';
+    
+    *pos = p;
+    *val_p = sign*val;
+
+    return 0;
+}
+
+/*
+ * Loads a CRAM .crai index into memory.
+ *
+ * Returns 0 for success
+ *        -1 for failure
+ */
+int cram_index_load(cram_fd *fd, const char *fn) {
+    char fn2[PATH_MAX];
+    char buf[65536];
+    ssize_t len;
+    kstring_t kstr = {0};
+    hFILE *fp;
+    cram_index *idx;
+    cram_index **idx_stack = NULL, *ep, e;
+    int idx_stack_alloc = 0, idx_stack_ptr = 0;
+    size_t pos = 0;
+
+    /* Check if already loaded */
+    if (fd->index)
+	return 0;
+
+    fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
+    if (!fd->index)
+	return -1;
+
+    idx = &fd->index[0];
+    idx->refid = -1;
+    idx->start = INT_MIN;
+    idx->end   = INT_MAX;
+
+    idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
+    idx_stack[idx_stack_ptr] = idx;
+
+    sprintf(fn2, "%s.crai", fn);
+    if (!(fp = hopen(fn2, "r"))) {
+	perror(fn2);
+	free(idx_stack);
+	return -1; 
+    }
+
+    // Load the file into memory
+    while ((len = hread(fp, buf, 65536)) > 0)
+	kputsn(buf, len, &kstr);
+    if (len < 0 || kstr.l < 2) {
+	if (kstr.s)
+	    free(kstr.s);
+	free(idx_stack);
+	return -1;
+    }
+
+    if (hclose(fp)) {
+	if (kstr.s)
+	    free(kstr.s);
+	free(idx_stack);
+	return -1;
+    }
+	
+
+    // Uncompress if required
+    if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
+	size_t l;
+	char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
+	free(kstr.s);
+	if (!s) {
+	    free(idx_stack);
+	    return -1;
+	}
+	kstr.s = s;
+	kstr.l = l;
+	kstr.m = l; // conservative estimate of the size allocated
+	kputsn("", 0, &kstr); // ensure kstr.s is NUL-terminated
+    }
+
+
+    // Parse it line at a time
+    do {
+	/* 1.1 layout */
+	if (kget_int32(&kstr, &pos, &e.refid) == -1) {
+	    free(kstr.s); free(idx_stack); return -1;
+	}
+	if (kget_int32(&kstr, &pos, &e.start) == -1) {
+	    free(kstr.s); free(idx_stack); return -1;
+	}
+	if (kget_int32(&kstr, &pos, &e.end) == -1) {
+	    free(kstr.s); free(idx_stack); return -1;
+	}
+	if (kget_int64(&kstr, &pos, &e.offset) == -1) {
+	    free(kstr.s); free(idx_stack); return -1;
+	}
+	if (kget_int32(&kstr, &pos, &e.slice) == -1) {
+	    free(kstr.s); free(idx_stack); return -1;
+	}
+	if (kget_int32(&kstr, &pos, &e.len) == -1) {
+	    free(kstr.s); free(idx_stack); return -1;
+	}
+
+	e.end += e.start-1;
+	//printf("%d/%d..%d\n", e.refid, e.start, e.end);
+
+	if (e.refid < -1) {
+	    free(kstr.s);
+	    free(idx_stack);
+	    fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
+	    return -1;
+	}
+
+	if (e.refid != idx->refid) {
+	    if (fd->index_sz < e.refid+2) {
+		size_t index_end = fd->index_sz * sizeof(*fd->index);
+		fd->index_sz = e.refid+2;
+		fd->index = realloc(fd->index,
+				    fd->index_sz * sizeof(*fd->index));
+		memset(((char *)fd->index) + index_end, 0,
+		       fd->index_sz * sizeof(*fd->index) - index_end);
+	    }
+	    idx = &fd->index[e.refid+1];
+	    idx->refid = e.refid;
+	    idx->start = INT_MIN;
+	    idx->end   = INT_MAX;
+	    idx->nslice = idx->nalloc = 0;
+	    idx->e = NULL;
+	    idx_stack[(idx_stack_ptr = 0)] = idx;
+	}
+
+	while (!(e.start >= idx->start && e.end <= idx->end)) {
+	    idx = idx_stack[--idx_stack_ptr];
+	}
+
+	// Now contains, so append
+	if (idx->nslice+1 >= idx->nalloc) {
+	    idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
+	    idx->e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
+	}
+
+	e.nalloc = e.nslice = 0; e.e = NULL;
+	*(ep = &idx->e[idx->nslice++]) = e;
+	idx = ep;
+
+	if (++idx_stack_ptr >= idx_stack_alloc) {
+	    idx_stack_alloc *= 2;
+	    idx_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
+	}
+	idx_stack[idx_stack_ptr] = idx;
+
+	while (pos < kstr.l && kstr.s[pos] != '\n')
+	    pos++;
+	pos++;
+    } while (pos < kstr.l);
+
+    free(idx_stack);
+    free(kstr.s);
+
+    // dump_index(fd);
+
+    return 0;
+}
+
+static void cram_index_free_recurse(cram_index *e) {
+    if (e->e) {
+	int i;
+	for (i = 0; i < e->nslice; i++) {
+	    cram_index_free_recurse(&e->e[i]);
+	}
+	free(e->e);
+    }
+}
+
+void cram_index_free(cram_fd *fd) {
+    int i;
+
+    if (!fd->index)
+	return;
+    
+    for (i = 0; i < fd->index_sz; i++) {
+	cram_index_free_recurse(&fd->index[i]);
+    }
+    free(fd->index);
+
+    fd->index = NULL;
+}
+
+/*
+ * Searches the index for the first slice overlapping a reference ID
+ * and position, or one immediately preceding it if none is found in
+ * the index to overlap this position. (Our index may have missing
+ * entries, but we require at least one per reference.)
+ *
+ * If the index finds multiple slices overlapping this position we
+ * return the first one only. Subsequent calls should specifying
+ * "from" as the last slice we checked to find the next one. Otherwise
+ * set "from" to be NULL to find the first one.
+ *
+ * Returns the cram_index pointer on sucess
+ *         NULL on failure
+ */
+cram_index *cram_index_query(cram_fd *fd, int refid, int pos, 
+			     cram_index *from) {
+    int i, j, k;
+    cram_index *e;
+
+    if (refid+1 < 0 || refid+1 >= fd->index_sz)
+	return NULL;
+
+    i = 0, j = fd->index[refid+1].nslice-1;
+
+    if (!from)
+	from = &fd->index[refid+1];
+
+    for (k = j/2; k != i; k = (j-i)/2 + i) {
+	if (from->e[k].refid > refid) {
+	    j = k;
+	    continue;
+	}
+
+	if (from->e[k].refid < refid) {
+	    i = k;
+	    continue;
+	}
+
+	if (from->e[k].start >= pos) {
+	    j = k;
+	    continue;
+	}
+
+	if (from->e[k].start < pos) {
+	    i = k;
+	    continue;
+	}
+    }
+    // i==j or i==j-1. Check if j is better.
+    if (from->e[j].start < pos && from->e[j].refid == refid)
+	i = j;
+
+    /* The above found *a* bin overlapping, but not necessarily the first */
+    while (i > 0 && from->e[i-1].end >= pos)
+	i--;
+
+    /* Special case for matching a start pos */
+    if (i+1 < from->nslice &&
+	from->e[i+1].start == pos &&
+	from->e[i+1].refid == refid)
+	i++;
+
+    e = &from->e[i];
+
+    return e;
+}
+
+
+/*
+ * Skips to a container overlapping the start coordinate listed in
+ * cram_range.
+ *
+ * In theory we call cram_index_query multiple times, once per slice
+ * overlapping the range. However slices may be absent from the index
+ * which makes this problematic. Instead we find the left-most slice
+ * and then read from then on, skipping decoding of slices and/or
+ * whole containers when they don't overlap the specified cram_range.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
+    cram_index *e;
+
+    // Ideally use an index, so see if we have one.
+    if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
+	if (0 != cram_seek(fd, e->offset, SEEK_SET))
+	    if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
+		return -1;
+    } else {
+	fprintf(stderr, "Unknown reference ID. Missing from index?\n");
+	return -1;
+    }
+
+    if (fd->ctr) {
+	cram_free_container(fd->ctr);
+	fd->ctr = NULL;
+	fd->ooc = 0;
+    }
+
+    return 0;
+}
+
+
+/*
+ * A specialised form of cram_index_build (below) that deals with slices
+ * having multiple references in this (ref_id -2). In this scenario we
+ * decode the slice to look at the RI data series instead.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+static int cram_index_build_multiref(cram_fd *fd,
+				     cram_container *c,
+				     cram_slice *s,
+				     zfp *fp,
+				     off_t cpos,
+				     int32_t landmark,
+				     int sz) {
+    int i, ref = -2, ref_start = 0, ref_end;
+    char buf[1024];
+
+    if (0 != cram_decode_slice(fd, c, s, fd->header))
+	return -1;
+
+    ref_end = INT_MIN;
+    for (i = 0; i < s->hdr->num_records; i++) {
+	if (s->crecs[i].ref_id == ref) {
+	    if (ref_end < s->crecs[i].aend)
+		ref_end = s->crecs[i].aend;
+	    continue;
+	}
+
+	if (ref != -2) {
+	    sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
+		    ref, ref_start, ref_end - ref_start + 1,
+		    (int64_t)cpos, landmark, sz);
+	    zfputs(buf, fp);
+	}
+
+	ref = s->crecs[i].ref_id;
+	ref_start = s->crecs[i].apos;
+	ref_end = INT_MIN;
+    }
+
+    if (ref != -2) {
+	sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
+		ref, ref_start, ref_end - ref_start + 1,
+		(int64_t)cpos, landmark, sz);
+	zfputs(buf, fp);
+    }
+
+    return 0;
+}
+
+/*
+ * Builds an index file.
+ *
+ * fd is a newly opened cram file that we wish to index.
+ * fn_base is the filename of the associated CRAM file. Internally we
+ * add ".crai" to this to get the index filename.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int cram_index_build(cram_fd *fd, const char *fn_base) {
+    cram_container *c;
+    off_t cpos, spos, hpos;
+    zfp *fp;
+    char fn_idx[PATH_MAX];
+
+    if (strlen(fn_base) > PATH_MAX-6)
+	return -1;
+
+    sprintf(fn_idx, "%s.crai", fn_base);
+    if (!(fp = zfopen(fn_idx, "wz"))) {
+        perror(fn_idx);
+        return -1;
+    }
+
+    cpos = htell(fd->fp);
+    while ((c = cram_read_container(fd))) {
+        int j;
+
+        if (fd->err) {
+            perror("Cram container read");
+            return 1;
+        }
+
+        hpos = htell(fd->fp);
+
+        if (!(c->comp_hdr_block = cram_read_block(fd)))
+            return 1;
+        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
+
+        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
+        if (!c->comp_hdr)
+            return -1;
+
+        // 2.0 format
+        for (j = 0; j < c->num_landmarks; j++) {
+            char buf[1024];
+            cram_slice *s;
+            int sz;
+
+            spos = htell(fd->fp);
+            assert(spos - cpos - c->offset == c->landmark[j]);
+
+            if (!(s = cram_read_slice(fd))) {
+		zfclose(fp);
+		return -1;
+	    }
+
+            sz = (int)(htell(fd->fp) - spos);
+
+	    if (s->hdr->ref_seq_id == -2) {
+		cram_index_build_multiref(fd, c, s, fp,
+					  cpos, c->landmark[j], sz);
+	    } else {
+		sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
+			s->hdr->ref_seq_id, s->hdr->ref_seq_start,
+			s->hdr->ref_seq_span, (int64_t)cpos,
+			c->landmark[j], sz);
+		zfputs(buf, fp);
+	    }
+
+            cram_free_slice(s);
+        }
+
+        cpos = htell(fd->fp);
+        assert(cpos == hpos + c->length);
+
+        cram_free_container(c);
+    }
+    if (fd->err) {
+	zfclose(fp);
+	return -1;
+    }
+	
+
+    return zfclose(fp);
+}