diff ezBAMQC/src/htslib/cram/cram_stats.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ezBAMQC/src/htslib/cram/cram_stats.c	Thu Mar 24 17:12:52 2016 -0400
@@ -0,0 +1,450 @@
+/*
+Copyright (c) 2012-2013 Genome Research Ltd.
+Author: James Bonfield <jkb@sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, 
+this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice, 
+this list of conditions and the following disclaimer in the documentation 
+and/or other materials provided with the distribution.
+
+   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "io_lib_config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "cram/cram.h"
+#include "cram/os.h"
+
+cram_stats *cram_stats_create(void) {
+    return calloc(1, sizeof(cram_stats));
+}
+
+void cram_stats_add(cram_stats *st, int32_t val) {
+    st->nsamp++;
+
+    //assert(val >= 0);
+
+    if (val < MAX_STAT_VAL && val >= 0) {
+	st->freqs[val]++;
+    } else {
+	khint_t k;
+	int r;
+
+	if (!st->h) {
+	    st->h = kh_init(m_i2i);
+	}
+
+	k = kh_put(m_i2i, st->h, val, &r);
+	if (r == 0)
+	    kh_val(st->h, k)++;
+	else if (r != -1)
+	    kh_val(st->h, k) = 1;
+	else
+	    ; // FIXME: handle error
+    }
+}
+
+void cram_stats_del(cram_stats *st, int32_t val) {
+    st->nsamp--;
+
+    //assert(val >= 0);
+
+    if (val < MAX_STAT_VAL && val >= 0) {
+	st->freqs[val]--;
+	assert(st->freqs[val] >= 0);
+    } else if (st->h) {
+	khint_t k = kh_get(m_i2i, st->h, val);
+
+	if (k != kh_end(st->h)) {
+	    if (--kh_val(st->h, k) == 0)
+		kh_del(m_i2i, st->h, k);
+	} else {
+	    fprintf(stderr, "Failed to remove val %d from cram_stats\n", val);
+	    st->nsamp++;
+	}
+    } else {
+	fprintf(stderr, "Failed to remove val %d from cram_stats\n", val);
+	st->nsamp++;
+    }
+}
+
+void cram_stats_dump(cram_stats *st) {
+    int i;
+    fprintf(stderr, "cram_stats:\n");
+    for (i = 0; i < MAX_STAT_VAL; i++) {
+	if (!st->freqs[i])
+	    continue;
+	fprintf(stderr, "\t%d\t%d\n", i, st->freqs[i]);
+    }
+    if (st->h) {
+	khint_t k;
+	for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
+	    if (!kh_exist(st->h, k))
+		continue;
+
+	    fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, k), kh_val(st->h, k));
+	}
+    }
+}
+
+#if 1
+/* Returns the number of bits set in val; it the highest bit used */
+static int nbits(int v) {
+    static const int MultiplyDeBruijnBitPosition[32] = {
+	1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
+	9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
+    };
+
+    v |= v >> 1; // first up to set all bits 1 after the first 1 */
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+
+    // DeBruijn magic to find top bit
+    return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
+}
+#endif
+
+/*
+ * Computes entropy from integer frequencies for various encoding methods and
+ * picks the best encoding.
+ *
+ * FIXME: we could reuse some of the code here for the actual encoding
+ * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman.
+ *
+ * Returns the best codec to use.
+ */
+enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) {
+    enum cram_encoding best_encoding = E_NULL;
+    int best_size = INT_MAX, bits;
+    int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k;
+    int *vals = NULL, *freqs = NULL, vals_alloc = 0, *codes;
+
+    //cram_stats_dump(st);
+
+    /* Count number of unique symbols */
+    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
+	if (!st->freqs[i])
+	    continue;
+	if (nvals >= vals_alloc) {
+	    vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
+	    vals  = realloc(vals,  vals_alloc * sizeof(int));
+	    freqs = realloc(freqs, vals_alloc * sizeof(int));
+	    if (!vals || !freqs) {
+		if (vals)  free(vals);
+		if (freqs) free(freqs);
+		return E_HUFFMAN; // Cannot do much else atm
+	    }
+	}
+	vals[nvals] = i;
+	freqs[nvals] = st->freqs[i];
+	ntot += freqs[nvals];
+	if (max_val < i) max_val = i;
+	if (min_val > i) min_val = i;
+	nvals++;
+    }
+    if (st->h) {
+	khint_t k;
+	int i;
+
+	for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
+	    if (!kh_exist(st->h, k))
+		continue;
+
+	    if (nvals >= vals_alloc) {
+		vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
+		vals  = realloc(vals,  vals_alloc * sizeof(int));
+		freqs = realloc(freqs, vals_alloc * sizeof(int));
+		if (!vals || !freqs)
+		    return E_HUFFMAN; // Cannot do much else atm
+	    }
+	    i = kh_key(st->h, k);
+	    vals[nvals]=i;
+	    freqs[nvals] = kh_val(st->h, k);
+	    ntot += freqs[nvals];
+	    if (max_val < i) max_val = i;
+	    if (min_val > i) min_val = i;
+	    nvals++;
+	}
+    }
+
+    st->nvals = nvals;
+    assert(ntot == st->nsamp);
+
+    if (nvals <= 1) {
+	free(vals);
+	free(freqs);
+	return E_HUFFMAN;
+    }
+
+    if (fd->verbose > 1)
+	fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n",
+		min_val, max_val, nvals, ntot);
+
+    /* Theoretical entropy */
+//    if (fd->verbose > 1) {
+//	double dbits = 0;
+//	for (i = 0; i < nvals; i++) {
+//	    dbits += freqs[i] * log((double)freqs[i]/ntot);
+//	}
+//	dbits /= -log(2);
+//	if (fd->verbose > 1)
+//	    fprintf(stderr, "Entropy = %f\n", dbits);
+//    }
+
+    if (nvals > 1 && ntot > 256) {
+#if 0
+	/*
+	 * CRUDE huffman estimator. Round to closest and round up from 0
+	 * to 1 bit.
+	 *
+	 * With and without ITF8 incase we have a few discrete values but with
+	 * large magnitude.
+	 *
+	 * Note rans0/arith0 and Z_HUFFMAN_ONLY vs internal huffman can be
+	 * compared in this way, but order-1 (eg rans1) or maybe LZ77 modes
+	 * may detect the correlation of high bytes to low bytes in multi-
+	 * byte values. So this predictor breaks down.
+	 */
+	double dbits = 0;  // entropy + ~huffman
+	double dbitsH = 0;
+	double dbitsE = 0; // external entropy + ~huffman
+	double dbitsEH = 0;
+	int F[256] = {0}, n = 0;
+	double e = 0; // accumulated error bits
+	for (i = 0; i < nvals; i++) {
+	    double x; int X;
+	    unsigned int v = vals[i];
+
+	    //Better encoding would cope with sign.
+	    //v = ABS(vals[i])*2+(vals[i]<0);
+
+	    if (!(v & ~0x7f)) {
+		F[v]             += freqs[i], n+=freqs[i];
+	    } else if (!(v & ~0x3fff)) {
+		F[(v>>8) |0x80] += freqs[i];
+		F[ v     &0xff] += freqs[i], n+=2*freqs[i];
+	    } else if (!(v & ~0x1fffff)) {
+		F[(v>>16)|0xc0] += freqs[i];
+		F[(v>>8 )&0xff] += freqs[i];
+		F[ v     &0xff] += freqs[i], n+=3*freqs[i];
+	    } else if (!(v & ~0x0fffffff)) {
+		F[(v>>24)|0xe0] += freqs[i];
+		F[(v>>16)&0xff] += freqs[i];
+		F[(v>>8 )&0xff] += freqs[i];
+		F[ v     &0xff] += freqs[i], n+=4*freqs[i];
+	    } else {
+		F[(v>>28)|0xf0] += freqs[i];
+		F[(v>>20)&0xff] += freqs[i];
+		F[(v>>12)&0xff] += freqs[i];
+		F[(v>>4 )&0xff] += freqs[i];
+		F[ v     &0x0f] += freqs[i], n+=5*freqs[i];
+	    }
+
+	    x = -log((double)freqs[i]/ntot)/.69314718055994530941;
+	    X = x+0.5;
+	    if ((int)(x+((double)e/freqs[i])+.5)>X) {
+		X++;
+	    } else if ((int)(x+((double)e/freqs[i])+.5)<X) {
+		X--;
+	    }
+	    e-=freqs[i]*(X-x);
+	    X += (X==0);
+
+	    //fprintf(stderr, "Val %d = %d x %d (ent %f, %d) e %f\n", i, v, freqs[i], x, X, e);
+
+	    dbits  += freqs[i] * x;
+	    dbitsH += freqs[i] * X;
+	}
+
+	for (i = 0; i < 256; i++) {
+	    if (F[i]) {
+		double x = -log((double)F[i]/n)/.69314718055994530941;
+		int X = x+0.5;
+		X += (X==0);
+		dbitsE  += F[i] * x;
+		dbitsEH += F[i] * X;
+
+		//fprintf(stderr, "Val %d = %d x %d (e %f, %d)\n", i, i, F[i], x, X);
+	    }
+	}
+
+	//fprintf(stderr, "CORE Entropy = %f, %f\n", dbits/8, dbitsH/8);
+	//fprintf(stderr, "Ext. Entropy = %f, %f\n", dbitsE/8, dbitsEH/8);
+
+	if (dbitsE < 1000 || dbitsE / dbits > 1.1) {
+	    //fprintf(stderr, "=> %d < 200 ? E_HUFFMAN : E_BETA\n", nvals);
+	    free(vals); free(freqs);
+	    return nvals < 200 ? E_HUFFMAN : E_BETA;
+	}
+#endif
+	free(vals); free(freqs);
+	return E_EXTERNAL;
+    }
+
+    /*
+     * Avoid complex stats for now, just do heuristic of HUFFMAN for small
+     * alphabets and BETA for anything large.
+     */
+    free(vals); free(freqs);
+    return nvals < 200 ? E_HUFFMAN : E_BETA;
+    //return E_HUFFMAN;
+    //return E_EXTERNAL;
+
+
+    /* We only support huffman now anyway... */
+    //free(vals); free(freqs); return E_HUFFMAN;
+
+    /* Beta */
+    bits = nbits(max_val - min_val) * ntot;
+    if (fd->verbose > 1)
+	fprintf(stderr, "BETA    = %d\n", bits);
+    if (best_size > bits)
+	best_size = bits, best_encoding = E_BETA;
+
+#if 0
+    /* Unary */
+    if (min_val >= 0) {
+	for (bits = i = 0; i < nvals; i++)
+	    bits += freqs[i]*(vals[i]+1);
+	if (fd->verbose > 1)
+	    fprintf(stderr, "UNARY   = %d\n", bits);
+	if (best_size > bits)
+	    best_size = bits, best_encoding = E_NULL; //E_UNARY;
+    }
+
+    /* Gamma */
+    for (bits = i = 0; i < nvals; i++)
+	bits += ((nbits(vals[i]-min_val+1)-1) + nbits(vals[i]-min_val+1)) * freqs[i];
+    if (fd->verbose > 1)
+	fprintf(stderr, "GAMMA   = %d\n", bits);
+    if (best_size > bits)
+	best_size = bits, best_encoding = E_GAMMA;
+
+    /* Subexponential */
+    for (k = 0; k < 10; k++) {
+	for (bits = i = 0; i < nvals; i++) {
+	    if (vals[i]-min_val < (1<<k))
+		bits += (1 + k)*freqs[i];
+	    else
+		bits += (nbits(vals[i]-min_val)*2-k)*freqs[i];
+	}
+
+	if (fd->verbose > 1)
+	    fprintf(stderr, "SUBEXP%d = %d\n", k, bits);
+	if (best_size > bits)
+	    best_size = bits, best_encoding = E_SUBEXP;
+    }
+#endif
+
+    /* byte array len */
+
+    /* byte array stop */
+
+    /* External? Guesswork! */
+
+    /* Huffman */
+//    qsort(freqs, nvals, sizeof(freqs[0]), sort_freqs);
+//    for (i = 0; i < nvals; i++) {
+//	fprintf(stderr, "%d = %d\n", i, freqs[i]);
+//	vals[i] = 0;
+//    }
+
+    /* Grow freqs to 2*freqs, to store sums */
+    /* Vals holds link data */
+    freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
+    codes = calloc(2*nvals, sizeof(*codes));
+    if (!freqs || !codes)
+	return E_HUFFMAN; // Cannot do much else atm
+
+    /* Inefficient, use pointers to form chain so we can insert and maintain
+     * a sorted list? This is currently O(nvals^2) complexity.
+     */
+    for (;;) {
+	int low1 = INT_MAX, low2 = INT_MAX;
+	int ind1 = 0, ind2 = 0;
+	for (i = 0; i < nvals; i++) {
+	    if (freqs[i] < 0)
+		continue;
+	    if (low1 > freqs[i]) 
+		low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
+	    else if (low2 > freqs[i])
+		low2 = freqs[i], ind2 = i;
+	}
+	if (low2 == INT_MAX)
+	    break;
+
+	//fprintf(stderr, "Merge ind %d (%d), %d (%d) = %d+%d, => %d=%d\n",
+	//	ind1, vals[ind1], ind2, vals[ind2], low1, low2,
+	//	nvals, low1+low2);
+
+	freqs[nvals] = low1 + low2;
+	codes[ind1] = nvals;
+	codes[ind2] = nvals;
+	freqs[ind1] *= -1;
+	freqs[ind2] *= -1;
+	nvals++;
+    }
+    nvals = nvals/2+1;
+
+    for (i = 0; i < nvals; i++) {
+	int code_len = 0;
+	for (k = codes[i]; k; k = codes[k])
+	    code_len++;
+	codes[i] = code_len;
+	freqs[i] *= -1;
+	//fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], codes[i]);
+    }
+
+    for (bits = i = 0; i < nvals; i++) {
+	bits += freqs[i] * codes[i];
+    }
+    if (fd->verbose > 1)
+	fprintf(stderr, "HUFFMAN = %d\n", bits);
+    if (best_size >= bits)
+	best_size = bits, best_encoding = E_HUFFMAN;
+    free(codes);
+
+    free(vals);
+    free(freqs);
+
+    return best_encoding;
+}
+
+void cram_stats_free(cram_stats *st) {
+    if (st->h)
+	kh_destroy(m_i2i, st->h);
+    free(st);
+}