Mercurial > repos > dawe > srf2fastq
diff srf2fastq/io_lib-1.12.2/io_lib/deflate_interlaced.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srf2fastq/io_lib-1.12.2/io_lib/deflate_interlaced.c Tue Jun 07 17:48:05 2011 -0400 @@ -0,0 +1,2282 @@ +/* + * This code implements "interlaced deflate", and is *based* on a + * simplistic implementation of the Deflate algorithm. + * See http://www.ietf.org/rfc/rfc1951.txt for details on the original + * Deflate algorithm. + * + * It differs from RFC1951 in two important ways: + * + * 1) It only supports the huffman encoding step and does not attempt + * to do any LZ-style string matching to generate distance codes. + * (These generally do not improve data compression for our desired + * use.) + * + * 2) It optionally allows interleaving of multiple huffman trees for + * a single data stream. NB: when multiple codes are used this is + * incompatible with RFC1951. + * + * It has been written here, instead of using zlib, so that we can separate + * out the encoding of the huffman tree from the compression of the data + * stream into separate memory sections with the intent to optimise + * compression of very small blocks of data by sharing one set of frequency + * tables (ie huffman tree) with multiple sets of compressed data blocks. + * + * James Bonfield, 2007 + */ + +#define NDEBUG /* disable asserts for production use */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <unistd.h> +#include <assert.h> +#include <unistd.h> + +#include "io_lib/deflate_interlaced.h" + +#ifndef MIN +# define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +# define MAX(a,b) ((a) > (b) ? (a) : (b)) +#endif + +/* #define TEST_MAIN */ + +/* + * --------------------------------------------------------------------------- + * Local structs & defines + */ + +/* Used in tree construction only */ +typedef struct node { + int count; + int sym; /* char or SYM_EOF */ + struct node *parent; + struct node *next; +} node_t; + +#define SYM_EOF 256 + +static void output_code_set(FILE *fp, huffman_codes_t *codes); +static void output_code_set2(FILE *fp, huffman_codes_t *codes); +int next_symbol(block_t *in, int *htab); + +/* + * --------------------------------------------------------------------------- + * Our standard precomputed tables, for DNA, English text, etc. + */ + +/* DNA */ +/* + * A 00 + * C 01 + * G 110 + * T 10 + * N 1110 + * EOF 11110 + * ? 11111* + */ +static huffman_code_t codes_dna[] = { + {'A', 2}, {'C', 2}, {'T', 2}, {'G', 3}, {'N', 4}, { 0, 5}, + {SYM_EOF, 6}, + { 1, 13}, { 2, 13}, { 3, 13}, { 4, 13}, { 5, 13}, { 6, 13}, + { 7, 14}, { 8, 14}, { 9, 14}, { 10, 14}, { 11, 14}, { 12, 14}, + { 13, 14}, { 14, 14}, { 15, 14}, { 16, 14}, { 17, 14}, { 18, 14}, + { 19, 14}, { 20, 14}, { 21, 14}, { 22, 14}, { 23, 14}, { 24, 14}, + { 25, 14}, { 26, 14}, { 27, 14}, { 28, 14}, { 29, 14}, { 30, 14}, + { 31, 14}, { 32, 14}, { 33, 14}, { 34, 14}, { 35, 14}, { 36, 14}, + { 37, 14}, { 38, 14}, { 39, 14}, { 40, 14}, { 41, 14}, { 42, 14}, + { 43, 14}, { 44, 14}, { 45, 14}, { 46, 14}, { 47, 14}, {'0', 14}, + {'1', 14}, {'2', 14}, {'3', 14}, {'4', 14}, {'5', 14}, {'6', 14}, + {'7', 14}, {'8', 14}, {'9', 14}, { 58, 14}, { 59, 14}, { 60, 14}, + { 61, 14}, { 62, 14}, { 63, 14}, { 64, 14}, {'B', 14}, {'D', 14}, + {'E', 14}, {'F', 14}, {'H', 14}, {'I', 14}, {'J', 14}, {'K', 14}, + {'L', 14}, {'M', 14}, {'O', 14}, {'P', 14}, {'Q', 14}, {'R', 14}, + {'S', 14}, {'U', 14}, {'V', 14}, {'W', 14}, {'X', 14}, {'Y', 14}, + {'Z', 14}, { 91, 14}, { 92, 14}, { 93, 14}, { 94, 14}, { 95, 14}, + { 96, 14}, {'a', 14}, {'b', 14}, {'c', 14}, {'d', 14}, {'e', 14}, + {'f', 14}, {'g', 14}, {'h', 14}, {'i', 14}, {'j', 14}, {'k', 14}, + {'l', 14}, {'m', 14}, {'n', 14}, {'o', 14}, {'p', 14}, {'q', 14}, + {'r', 14}, {'s', 14}, {'t', 14}, {'u', 14}, {'v', 14}, {'w', 14}, + {'x', 14}, {'y', 14}, {'z', 14}, {123, 14}, {124, 14}, {125, 14}, + {126, 14}, {127, 14}, {128, 14}, {129, 14}, {130, 14}, {131, 14}, + {132, 14}, {133, 14}, {134, 14}, {135, 14}, {136, 14}, {137, 14}, + {138, 14}, {139, 14}, {140, 14}, {141, 14}, {142, 14}, {143, 14}, + {144, 14}, {145, 14}, {146, 14}, {147, 14}, {148, 14}, {149, 14}, + {150, 14}, {151, 14}, {152, 14}, {153, 14}, {154, 14}, {155, 14}, + {156, 14}, {157, 14}, {158, 14}, {159, 14}, {160, 14}, {161, 14}, + {162, 14}, {163, 14}, {164, 14}, {165, 14}, {166, 14}, {167, 14}, + {168, 14}, {169, 14}, {170, 14}, {171, 14}, {172, 14}, {173, 14}, + {174, 14}, {175, 14}, {176, 14}, {177, 14}, {178, 14}, {179, 14}, + {180, 14}, {181, 14}, {182, 14}, {183, 14}, {184, 14}, {185, 14}, + {186, 14}, {187, 14}, {188, 14}, {189, 14}, {190, 14}, {191, 14}, + {192, 14}, {193, 14}, {194, 14}, {195, 14}, {196, 14}, {197, 14}, + {198, 14}, {199, 14}, {200, 14}, {201, 14}, {202, 14}, {203, 14}, + {204, 14}, {205, 14}, {206, 14}, {207, 14}, {208, 14}, {209, 14}, + {210, 14}, {211, 14}, {212, 14}, {213, 14}, {214, 14}, {215, 14}, + {216, 14}, {217, 14}, {218, 14}, {219, 14}, {220, 14}, {221, 14}, + {222, 14}, {223, 14}, {224, 14}, {225, 14}, {226, 14}, {227, 14}, + {228, 14}, {229, 14}, {230, 14}, {231, 14}, {232, 14}, {233, 14}, + {234, 14}, {235, 14}, {236, 14}, {237, 14}, {238, 14}, {239, 14}, + {240, 14}, {241, 14}, {242, 14}, {243, 14}, {244, 14}, {245, 14}, + {246, 14}, {247, 14}, {248, 14}, {249, 14}, {250, 14}, {251, 14}, + {252, 14}, {253, 14}, {254, 14}, {255, 14}, +}; + +/* DNA with a few ambiguity codes */ +static huffman_code_t codes_dna_ambig[] = { + {'A', 2}, {'C', 2}, {'T', 2}, {'G', 3}, {'N', 4}, { 0, 7}, + { 45, 7}, {'B', 8}, {'D', 8}, {'H', 8}, {'K', 8}, {'M', 8}, + {'R', 8}, {'S', 8}, {'V', 8}, {'W', 8}, {'Y', 8}, {SYM_EOF, 11}, + {226, 14}, { 1, 15}, { 2, 15}, { 3, 15}, { 4, 15}, { 5, 15}, + { 6, 15}, { 7, 15}, { 8, 15}, { 9, 15}, { 10, 15}, { 11, 15}, + { 12, 15}, { 13, 15}, { 14, 15}, { 15, 15}, { 16, 15}, { 17, 15}, + { 18, 15}, { 19, 15}, { 20, 15}, { 21, 15}, { 22, 15}, { 23, 15}, + { 24, 15}, { 25, 15}, { 26, 15}, { 27, 15}, { 28, 15}, { 29, 15}, + { 30, 15}, { 31, 15}, { 32, 15}, { 33, 15}, { 34, 15}, { 35, 15}, + { 36, 15}, { 37, 15}, { 38, 15}, { 39, 15}, { 40, 15}, { 41, 15}, + { 42, 15}, { 43, 15}, { 44, 15}, { 46, 15}, { 47, 15}, {'0', 15}, + {'1', 15}, {'2', 15}, {'3', 15}, {'4', 15}, {'5', 15}, {'6', 15}, + {'7', 15}, {'8', 15}, {'9', 15}, { 58, 15}, { 59, 15}, { 60, 15}, + { 61, 15}, { 62, 15}, { 63, 15}, { 64, 15}, {'E', 15}, {'F', 15}, + {'I', 15}, {'J', 15}, {'L', 15}, {'O', 15}, {'P', 15}, {'Q', 15}, + {'U', 15}, {'X', 15}, {'Z', 15}, { 91, 15}, { 92, 15}, { 93, 15}, + { 94, 15}, { 95, 15}, { 96, 15}, {'a', 15}, {'b', 15}, {'c', 15}, + {'d', 15}, {'e', 15}, {'f', 15}, {'g', 15}, {'h', 15}, {'i', 15}, + {'j', 15}, {'k', 15}, {'l', 15}, {'m', 15}, {'n', 15}, {'o', 15}, + {'p', 15}, {'q', 15}, {'r', 15}, {'s', 15}, {'t', 15}, {'u', 15}, + {'v', 15}, {'w', 15}, {'x', 15}, {'y', 15}, {'z', 15}, {123, 15}, + {124, 15}, {125, 15}, {126, 15}, {127, 15}, {128, 15}, {129, 15}, + {130, 15}, {131, 15}, {132, 15}, {133, 15}, {134, 15}, {135, 15}, + {136, 15}, {137, 15}, {138, 15}, {139, 15}, {140, 15}, {141, 15}, + {142, 15}, {143, 15}, {144, 15}, {145, 15}, {146, 15}, {147, 15}, + {148, 15}, {149, 15}, {150, 15}, {151, 15}, {152, 15}, {153, 15}, + {154, 15}, {155, 15}, {156, 15}, {157, 15}, {158, 15}, {159, 15}, + {160, 15}, {161, 15}, {162, 15}, {163, 15}, {164, 15}, {165, 15}, + {166, 15}, {167, 15}, {168, 15}, {169, 15}, {170, 15}, {171, 15}, + {172, 15}, {173, 15}, {174, 15}, {175, 15}, {176, 15}, {177, 15}, + {178, 15}, {179, 15}, {180, 15}, {181, 15}, {182, 15}, {183, 15}, + {184, 15}, {185, 15}, {186, 15}, {187, 15}, {188, 15}, {189, 15}, + {190, 15}, {191, 15}, {192, 15}, {193, 15}, {194, 15}, {195, 15}, + {196, 15}, {197, 15}, {198, 15}, {199, 15}, {200, 15}, {201, 15}, + {202, 15}, {203, 15}, {204, 15}, {205, 15}, {206, 15}, {207, 15}, + {208, 15}, {209, 15}, {210, 15}, {211, 15}, {212, 15}, {213, 15}, + {214, 15}, {215, 15}, {216, 15}, {217, 15}, {218, 15}, {219, 15}, + {220, 15}, {221, 15}, {222, 15}, {223, 15}, {224, 15}, {225, 15}, + {227, 15}, {228, 15}, {229, 15}, {230, 15}, {231, 15}, {232, 15}, + {233, 15}, {234, 15}, {235, 15}, {236, 15}, {237, 15}, {238, 15}, + {239, 15}, {240, 15}, {241, 15}, {242, 15}, {243, 15}, {244, 15}, + {245, 15}, {246, 15}, {247, 15}, {248, 15}, {249, 15}, {250, 15}, + {251, 15}, {252, 15}, {253, 15}, {254, 15}, {255, 15}, +}; + +/* English text */ +static huffman_code_t codes_english[] = { + { 32, 3}, {'e', 3}, {'a', 4}, {'i', 4}, {'n', 4}, {'o', 4}, + {'s', 4}, {'t', 4}, {'d', 5}, {'h', 5}, {'l', 5}, {'r', 5}, + {'u', 5}, { 10, 6}, { 13, 6}, { 44, 6}, {'c', 6}, {'f', 6}, + {'g', 6}, {'m', 6}, {'p', 6}, {'w', 6}, {'y', 6}, { 46, 7}, + {'b', 7}, {'v', 7}, { 34, 8}, {'I', 8}, {'k', 8}, { 45, 9}, + {'A', 9}, {'N', 9}, {'T', 9}, { 39, 10}, { 59, 10}, { 63, 10}, + {'B', 10}, {'C', 10}, {'E', 10}, {'H', 10}, {'M', 10}, {'S', 10}, + {'W', 10}, {'x', 10}, { 33, 11}, {'0', 11}, {'1', 11}, {'F', 11}, + {'G', 11}, { 0, 15}, { 1, 15}, { 2, 15}, { 3, 15}, { 4, 15}, + { 5, 15}, { 6, 15}, { 7, 15}, { 8, 15}, { 9, 15}, { 11, 15}, + { 12, 15}, { 14, 15}, { 15, 15}, { 16, 15}, { 17, 15}, { 18, 15}, + { 19, 15}, { 20, 15}, { 21, 15}, { 22, 15}, { 23, 15}, { 24, 15}, + { 25, 15}, { 26, 15}, { 27, 15}, { 28, 15}, { 29, 15}, { 30, 15}, + { 31, 15}, { 35, 15}, { 36, 15}, { 37, 15}, { 38, 15}, { 40, 15}, + { 41, 15}, { 42, 15}, { 43, 15}, { 47, 15}, {'2', 15}, {'3', 15}, + {'4', 15}, {'5', 15}, {'6', 15}, {'7', 15}, {'8', 15}, {'9', 15}, + { 58, 15}, { 60, 15}, { 61, 15}, { 62, 15}, { 64, 15}, {'D', 15}, + {'J', 15}, {'K', 15}, {'L', 15}, {'O', 15}, {'P', 15}, {'Q', 15}, + {'R', 15}, {'U', 15}, {'V', 15}, {'X', 15}, {'Y', 15}, {'Z', 15}, + { 91, 15}, { 92, 15}, { 93, 15}, { 94, 15}, { 95, 15}, { 96, 15}, + {'j', 15}, {'q', 15}, {'z', 15}, {123, 15}, {124, 15}, {125, 15}, + {126, 15}, {127, 15}, {128, 15}, {129, 15}, {130, 15}, {131, 15}, + {132, 15}, {133, 15}, {134, 15}, {135, 15}, {136, 15}, {137, 15}, + {138, 15}, {139, 15}, {140, 15}, {141, 15}, {142, 15}, {143, 15}, + {144, 15}, {145, 15}, {146, 15}, {147, 15}, {148, 15}, {149, 15}, + {150, 15}, {151, 15}, {152, 15}, {153, 15}, {154, 15}, {155, 15}, + {156, 15}, {157, 15}, {158, 15}, {159, 15}, {160, 15}, {161, 15}, + {162, 15}, {163, 15}, {164, 15}, {165, 15}, {166, 15}, {167, 15}, + {168, 15}, {169, 15}, {170, 15}, {171, 15}, {172, 15}, {173, 15}, + {174, 15}, {175, 15}, {176, 15}, {177, 15}, {178, 15}, {179, 15}, + {180, 15}, {181, 15}, {182, 15}, {183, 15}, {184, 15}, {185, 15}, + {186, 15}, {187, 15}, {188, 15}, {189, 15}, {190, 15}, {191, 15}, + {192, 15}, {193, 15}, {194, 15}, {195, 15}, {196, 15}, {197, 15}, + {198, 15}, {199, 15}, {200, 15}, {201, 15}, {202, 15}, {203, 15}, + {204, 15}, {205, 15}, {206, 15}, {207, 15}, {208, 15}, {209, 15}, + {210, 15}, {211, 15}, {212, 15}, {213, 15}, {214, 15}, {215, 15}, + {216, 15}, {217, 15}, {218, 15}, {219, 15}, {220, 15}, {221, 15}, + {222, 15}, {223, 15}, {224, 15}, {225, 15}, {226, 15}, {227, 15}, + {228, 15}, {229, 15}, {230, 15}, {231, 15}, {232, 15}, {233, 15}, + {234, 15}, {235, 15}, {236, 15}, {237, 15}, {238, 15}, {239, 15}, + {240, 15}, {241, 15}, {242, 15}, {243, 15}, {244, 15}, {245, 15}, + {246, 15}, {247, 15}, {248, 15}, {249, 15}, {250, 15}, {251, 15}, + {252, 15}, {253, 15}, {254, 15}, {255, 15}, {SYM_EOF, 15}, +}; + +static huffman_codeset_t *static_codeset[NCODES_STATIC]; + +/* + * --------------------------------------------------------------------------- + * Block_t structure support + */ + +/* + * Allocates and returns a new block_t struct of a specified default size. + * A default 'data' pointer may be passed in, in which it must have + * been created using malloc(size). Otherwise if data is NULL then + * size indicates the amount of memory to allocate. Size maybe zero to + * defer allocation. + * + * Returns newly created block_t* on success + * NULL on failure + */ +block_t *block_create(unsigned char *data, size_t size) { + block_t *b = (block_t *)malloc(sizeof(*b)); + if (!b) + return NULL; + + b->data = data; + b->alloc = size; + b->byte = 0; + b->bit = 0; + + if (size && !data && NULL == (b->data = calloc(size, 1))) { + free(b); + return NULL; + } + + return b; +} + +/* + * Deallocates memory created by block_create(). + * keep_data is a boolean which if true requests that the data held within + * the block should not be deallocated as it is in use elsewhere. + */ +void block_destroy(block_t *blk, int keep_data) { + if (!blk) + return; + + if (!keep_data && blk->data) + free(blk->data); + + free(blk); +} + +/* + * Ensures a block_t holds at least 'size' bytes. + * Newly allocated data is initialised to zero. + * + * Returns 0 on success + * -1 on failure, leaving block pointing to the existing data + */ +int block_resize(block_t *blk, size_t size) { + unsigned char *newp = NULL; + + if (!blk) + return -1; + + /* Grow size to next power of 2, if we're growing */ + if (size > blk->alloc) { + size--; + size |= size >> 1; + size |= size >> 2; + size |= size >> 4; + size |= size >> 8; + size |= size >> 16; + size++; + } + + if (NULL == (newp = realloc(blk->data, size))) + return -1; + else + blk->data = newp; + + if (size > blk->alloc) + memset(&blk->data[blk->alloc], 0, size - blk->alloc); + blk->alloc = size; + + return 0; +} + + +/* + * --------------------------------------------------------------------------- + * Tree building and code generation functions + */ + +/* + * Reverses the order of bits in the bottom nbits of val. + * Returns the bit-reverse value. + */ +unsigned int bit_reverse(unsigned int val, int nbits) { + unsigned int new = 0, i; + + for (i = 0; i < nbits; i++) { + new = (new << 1) | (val & 1); + val >>= 1; + } + + return new; +} + + +/* + * Generates canonical huffman codes given a set of symbol bit lengths. + * The results are stored within the supplied huffman_codes_t struct. + * + * Returns 0 on success + * -1 on failure + */ +static int canonical_codes(huffman_codes_t *c) { + int i, j; + unsigned int code, last_len; + int clens[33]; + int offs[33]; + huffman_code_t ctmp[258]; + signed int symtab[258]; + + /* Sort by bit-length, subfield symbol - much faster than qsort() */ + for (i = 0; i < 258; i++) + symtab[i] = -1; + for (i = 0; i < c->ncodes; i++) + symtab[c->codes[i].symbol] = i; + for (i = 0; i <= 32; i++) + offs[i] = clens[i] = 0; + for (i = 0; i < c->ncodes; i++) + clens[c->codes[i].nbits]++; + for (i = 1; i <= 32; i++) + offs[i] = offs[i-1] + clens[i-1]; + for (i = 0; i < 258; i++) { + if (symtab[i] != -1) + ctmp[offs[c->codes[symtab[i]].nbits]++] = c->codes[symtab[i]]; + } + memcpy(c->codes, ctmp, c->ncodes * sizeof(huffman_code_t)); + + /* + * Force all codes to be <= max_code_len. This is needed due to the + * 15-bit length limitation of Deflate literal codes and the 7-bit + * limit of the code bit-length table. + */ + /* Find first point of failure */ + for (i = 0; i < c->ncodes; i++) { + if (c->codes[i].nbits > c->max_code_len) + break; + } + + /* + * From here on we shrink the length of the current code by increasing + * the length of an earlier symbol, at last_code. + */ + if (i != c->ncodes) { + int delta = 0; + + /* + fprintf(stderr, "=== REORDERING %d ===\n", c->code_set); + output_code_set(stderr, c); + output_code_set2(stderr, c); + */ + + for (; i < c->ncodes; i++) { + int k, cur_len; + + c->codes[i].nbits -= delta; + if (c->codes[i].nbits <= c->max_code_len) + continue; + + for (j = i; j >= 0 && c->codes[j].nbits >= c->max_code_len; j--) + ; + if (j < 0) { + fprintf(stderr, + "Too many symbols to fit in bit-length requirements\n"); + fprintf(stderr, "=== FAILING ===\n"); + output_code_set(stderr, c); + output_code_set2(stderr, c); + abort(); + } + + /* + fprintf(stderr, "Changing code %d/%d to len %d\n", + c->codes[i].symbol, c->codes[j].symbol, + c->codes[j].nbits+1); + */ + cur_len = c->codes[i].nbits; + c->codes[i].nbits = ++c->codes[j].nbits; + + /* + * Shrink the next code by one, or if none at that bit-length + * the next 2, and so on + */ + delta = 1; + for (k = i+1; delta && k < c->ncodes; k++) { + while (c->codes[k].nbits > cur_len) { + delta *= 2; + cur_len++; + } + c->codes[k].nbits--; + delta--; + } + assert(delta == 0); + } + + /* + fprintf(stderr, "=== REORDERED TO %d ===\n", c->code_set); + output_code_set(stderr, c); + output_code_set2(stderr, c); + */ + + /* Ordering is shot - regenerate via brute force way */ + return canonical_codes(c); + } + + + /* Generate codes */ + code = last_len = 0; /* stop warning */ + for (i = 0; i < c->ncodes; i++) { + int nbits = c->codes[i].nbits; + + if (i == 0) { + code = 0; + last_len = nbits; + } else { + code++; + } + if (nbits > last_len) { + code <<= (nbits - last_len); + last_len = nbits; + } + c->codes[i].code = bit_reverse(code, nbits); + } + + /* Reindex so the symbol is the primary index into codes */ + for (i = 0; i <= 257; i++) { + c->lookup[i].nbits = 0; + } + for (i = 0; i < c->ncodes; i++) { + c->lookup[c->codes[i].symbol] = c->codes[i]; + } + + return 0; +} + +static int node_compar2(const void *vp1, const void *vp2) { + const node_t *n1 = *(const node_t **)vp1; + const node_t *n2 = *(const node_t **)vp2; + + /* + * The sort order is vital here. This needs to return the same collating + * order on all systems so that differing qsort() functions will not + * swap around symbols with the same bit lengths, hence we sort by both + * fields to force a unique stable ordering. + */ + if (n1->count != n2->count) + return n1->count - n2->count; + else + return n2->sym - n1->sym; +} + +/* + * Computes the huffman bit-lengths for a data set. We don't care + * about the actual tree, just how deep the symbols end up. + * + * Huffman trees are constructed by constructing a set of nodes + * initially containing the symbol and it's frequency. We then merge + * the two least used nodes to produce a new node with a combined + * frequency. Repeat until one root node is left. + * + * data/len is the input data to analyse. + * + * 'eof' is a boolean to indicate whether the EOF symbol should be included + * in the symbols produced. + * + * all_codes is a boolean to indicate whether we should include symbols not + * found in the input data set. (This was used to create the static lookup + * tables.) + * + * Returns huffman_codes_t* on success + * NULL on failure + */ +huffman_codes_t *calc_bit_lengths(unsigned char *data, int len, + int eof, int max_code_len, int all_codes, + int start, int skip) { + int i, ncodes; + node_t nodes[258+257], *head, *new = &nodes[258]; + node_t *n2[258+257]; + int map[258]; + huffman_codes_t *c; + int hist[256]; + + if (NULL == (c = (huffman_codes_t *)malloc(sizeof(*c)))) + return NULL; + c->codes_static = 0; + c->max_code_len = max_code_len; + + /* Count frequencies of symbols */ + memset(hist, 0, 256*sizeof(*hist)); + /* Calc freqs */ + for (i = start; i < len; i+=skip) { + hist[data[i]]++; + } + + + /* + * Initialise nodes. We build a map of ASCII character code to node + * number. (By default it's a simple 1:1 mapping unless legal_chars is + * defined.) + */ + ncodes = 0; + if (eof) { + nodes[ncodes].sym = SYM_EOF; + nodes[ncodes].count = eof; + nodes[ncodes].parent = NULL; + n2[ncodes] = &nodes[ncodes]; + map[SYM_EOF] = ncodes++; + } + + /* All 256 chars existing at a minimal level */ + if (all_codes) { + for (i = 0; i < 256; i++) { + nodes[ncodes].sym = i; + nodes[ncodes].count = hist[i]; + nodes[ncodes].parent = NULL; + n2[ncodes] = &nodes[ncodes]; + map[i] = ncodes++; + } + } else { + /* Only include non-zero symbols */ + for (i = 0; i < 256; i++) { + if (hist[i] == 0) + continue; + nodes[ncodes].sym = i; + nodes[ncodes].count = hist[i]; + nodes[ncodes].parent = NULL; + n2[ncodes] = &nodes[ncodes]; + map[i] = ncodes++; + } + } + + /* Sort by counts, smallest first and form a sorted linked list */ + qsort(n2, ncodes, sizeof(*n2), node_compar2); + + /* Skip symbols that do not occur, unless all_codes is true */ + for (i = 0; i < ncodes; i++) { + n2[i]->next = i+1 < ncodes ? n2[i+1] : NULL; + } + + /* Repeatedly merge two smallest values */ + head = n2[0]; + while (head && head->next) { + node_t *after = head->next, *n; + int sum = head->count + head->next->count; + + for (n = head->next->next; n; after = n, n = n->next) { + if (sum < n->count) + break; + } + + /* Produce a new summation node and link it in place */ + after->next = new; + new->next = n; + new->sym = '?'; + new->count = sum; + new->parent = NULL; + head->parent = new; + head->next->parent = new; + head = head->next->next; + + new++; + } + + /* Walk up tree computing the bit-lengths for our symbols */ + c->ncodes = ncodes; + c->codes = (huffman_code_t *)malloc(c->ncodes * sizeof(*c->codes)); + if (NULL == c->codes) { + free(c); + return NULL; + } + + for (i = 0; i < ncodes; i++) { + int len = 0; + node_t *n; + for (n = n2[i]->parent; n; n = n->parent) { + len++; + } + + c->codes[i].symbol = n2[i]->sym; + c->codes[i].freq = n2[i]->count; + c->codes[i].nbits = len ? len : 1; /* special case, nul input */ + } + + if (0 != canonical_codes(c)) { + free(c); + return NULL; + } + + return c; +} + +/* + * A special case of the generate_code_set() function below, but for + * creating predefined code sets from bit-length arrays. Useful for + * code that wants to use a predetermined huffman tree. + * + * + * Returns huffman_codes_t* on success; free using huffman_codes_destroy(). + * NULL on failure. + */ +huffman_codeset_t *codes2codeset(huffman_code_t *codes, int ncodes, + int code_num) { + huffman_codeset_t *cs; + huffman_codes_t *c; + + if (NULL == (cs = (huffman_codeset_t *)malloc(sizeof(*cs)))) + return NULL; + + if (NULL == (c = (huffman_codes_t *)malloc(sizeof(*c)))) + return NULL; + + cs->codes = (huffman_codes_t **)malloc(sizeof(*cs->codes)); + cs->codes[0] = c; + cs->ncodes = 1; + cs->code_set = code_num; + cs->blk = NULL; + cs->bit_num = 0; + cs->decode_t = NULL; + cs->decode_J4 = NULL; + + c->codes_static = 1; + c->max_code_len = MAX_CODE_LEN; + + c->codes = codes; + c->ncodes = ncodes; + + cs->bit_num = 0; /* FIXME: need to know this */ + + canonical_codes(c); + + return cs; +} + +/* + * Initialises and returns a huffman_codes_t struct from a specified code_set. + * If code_set is not one of the standard predefined values then the + * input data is analysed using calc_bit_lengths() above to produce the + * optimal set of huffman codes, otherwise we return predefined values. + * + * 'eof' is a boolean to indicate whether the EOF symbol should be included + * in the symbols produced. + * + * all_codes is a boolean to indicate whether we should include symbols not + * found in the input data set. (This was used to create the static lookup + * tables.) + * + * Returns huffman_codes_t* on success; free using huffman_codes_destroy(). + * NULL on failure. + */ +huffman_codeset_t *generate_code_set(int code_set, int ncodes, + unsigned char *data, int len, + int eof, int max_code_len, + int all_codes) { + huffman_codeset_t *cs; + + /* + * Either INLINE or a CODE_USER+ set of codes. + * => analyse the data and compute a new set of bit-lengths & codes. + */ + if (code_set >= 128 || code_set == CODE_INLINE) { + int i; + + if (NULL == (cs = (huffman_codeset_t *)malloc(sizeof(*cs)))) + return NULL; + + cs->code_set = code_set; + cs->ncodes = ncodes; + cs->codes = (huffman_codes_t **)malloc(cs->ncodes*sizeof(*cs->codes)); + cs->blk = NULL; + cs->bit_num = 0; + cs->decode_t = NULL; + cs->decode_J4 = NULL; + + for (i = 0; i < ncodes; i++) { + /* + * If requested, include EOF all code sets, but at a + * frequency of only '1' occurrance where we predict it + * not to be needed. + */ + if (eof && (len+i)%ncodes) + eof = 1; + cs->codes[i] = calc_bit_lengths(data, len, eof, max_code_len, + all_codes, i, ncodes); + cs->codes[i]->codes_static = 0; + if (NULL == cs->codes[i]) { + /* FIXME: tidy up */ + return NULL; + } + + canonical_codes(cs->codes[i]); + } + + /* + * Otherwise we use the determined codes at the top of this file, such + * as codes_dna and codes_english. + */ + } else { + if (code_set < 1 || code_set >= NCODES_STATIC) { + fprintf(stderr, "Unknown huffman code set '%d'\n", code_set); + return NULL; + } + + /* If our global codeset hasn't been initialised yet, do so */ + if (!static_codeset[code_set]) { + huffman_codes_t *c = (huffman_codes_t *)malloc(sizeof(*c)); + + if (NULL == (cs = (huffman_codeset_t *)malloc(sizeof(*cs)))) + return NULL; + + cs->codes = (huffman_codes_t **)malloc(sizeof(*cs->codes)); + cs->codes[0] = c; + cs->ncodes = 1; + cs->code_set = code_set; + cs->blk = NULL; + cs->bit_num = 0; + cs->decode_t = NULL; + cs->decode_J4 = NULL; + + c->codes_static = 1; + c->max_code_len = MAX_CODE_LEN; + + switch(code_set) { + case CODE_DNA: + c->codes = codes_dna; + c->ncodes = sizeof(codes_dna)/sizeof(*c->codes); + cs->bit_num = 5; + break; + + case CODE_DNA_AMBIG: + c->codes = codes_dna_ambig; + c->ncodes = sizeof(codes_dna_ambig)/sizeof(*c->codes); + cs->bit_num = 1; + break; + + case CODE_ENGLISH: + c->codes = codes_english; + c->ncodes = sizeof(codes_english)/sizeof(*c->codes); + cs->bit_num = 1; + break; + + default: + fprintf(stderr, "Unknown huffman code set '%d'\n", code_set); + return NULL; + } + + canonical_codes(c); + + static_codeset[code_set] = cs; + } + + cs = static_codeset[code_set]; + } + + return cs; +} + +void huffman_codes_destroy(huffman_codes_t *c) { + if (!c) + return; + + if (!c->codes_static && c->codes) + free(c->codes); + + free(c); +} + +void huffman_codeset_destroy(huffman_codeset_t *cs) { + int i; + + if (!cs) + return; + + /* If this codeset is one of the predefined global ones we do nothing */ + if (cs->ncodes == 1 && cs->codes[0]->codes_static) + return; + + for (i = 0; i < cs->ncodes; i++) + huffman_codes_destroy(cs->codes[i]); + if (cs->codes) + free(cs->codes); + if (cs->blk) + block_destroy(cs->blk, 0); + if (cs->decode_t) + free(cs->decode_t); + if (cs->decode_J4) + free(cs->decode_J4); + + free(cs); +} + +/* + * --------------------------------------------------------------------------- + * Encoding and decoding related functions + */ + +/* + * Can store up to 24-bits worth of data encoded in an integer value + * Possibly we'd want to have a less optimal store_bits function when dealing + * with nbits > 24, but for now we assume the codes generated are never + * that big. (Given this is only possible with 121392 or more + * characters with exactly the correct frequency distribution we check + * for it elsewhere.) + */ +static void store_bits(block_t *block, unsigned int val, int nbits) { + /* fprintf(stderr, " store_bits: %02x %d\n", val, nbits); */ + +#if 1 + { + unsigned int curr = block->data[block->byte]; + curr |= (val & ((1 << nbits)-1)) << block->bit; + block->bit += nbits; + while (block->bit >= 8) { + block->data[block->byte++] = curr & 0xff; + curr >>= 8; + block->bit -= 8; + } + block->data[block->byte] = curr & 0xff; + } + return; + +#else + + { + /* Slow, but simple */ + unsigned int mask = 1; + int bit = 1 << block->bit; + do { + if (val & mask) + block->data[block->byte] |= bit; + /* + * Data should be zeroed anyway, so this is not needed. + * + * else + * block->data[block->byte] &= ~bit; + */ + + if (++block->bit == 8) { + block->bit = 0; + block->byte++; + block->data[block->byte] = 0; + bit = 1; + } else { + bit <<= 1; + } + mask <<= 1; + } while(--nbits); + } + +#endif +} + +/* + * Reads up to 24-bits worth of data and returns. Updates the block + * byte and bit values to indicate the current 'read' position. + * + * Returns unsigned value on success (>=0) + * -1 on failure + */ +static signed int get_bits(block_t *block, int nbits) { + unsigned int val, bnum = 0; + + if (block->byte*8 + block->bit + nbits > block->alloc * 8) + return -1; + + /* Fetch the partial byte of data */ + val = (block->data[block->byte]) >> block->bit; + bnum = 8 - block->bit; + + /* And additional entire bytes worth as required */ + while (bnum <= nbits) { + val |= block->data[++block->byte] << bnum; + bnum += 8; + } + + block->bit = (block->bit + nbits) % 8; + return val & ((1 << nbits) - 1); +} + +/* stores nbytes bytes, padding to align on the next byte boundary */ +void store_bytes(block_t *block, unsigned char *val, int nbytes) { + /* Align */ + if (block->bit != 0) { + block->byte++; + block->bit = 0; + } + + /* Resize */ + block_resize(block, block->byte + nbytes + 1); + + /* Store */ + memcpy(&block->data[block->byte], val, nbytes); + block->byte += nbytes; +} + + +/* + * Encodes the huffman symbol bit-lengths as a serialised block of data + * suitable for storing in a ZTR "ZLBH" chunk. This uses the Deflate + * storage format defined in RFC1951. + * + * Returns: 0 on success + * -1 on failure + */ +int store_codes_single(block_t *out, huffman_codes_t *c) { + int i; + unsigned char bl_code[257]; /* bit-length codes and for codes 16-18 */ + unsigned char bl_opt[257]; /* the operand to the blcode */ + unsigned char sorted_codes[258]; + int bl_freq[19]; /* frequency of bit-length codes produced */ + int bl_count; + huffman_codes_t *bl_cds = NULL; + int hclen_order[] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + int hlit, hdist, hclen, hcmap[19]; + + /* output_code_set (stderr, c); */ + + if (out->alloc < out->byte + 1000) { + out->alloc = out->byte + 1000; + if (NULL == (out->data = realloc(out->data, out->alloc))) + return -1; + } + + /* + *----------------------------------------------------------------- + * Reformat the dynamic code bit-lengths into an alphabet of 19 + * "code length" symbols as defined in RFC1951. + */ + memset(sorted_codes, 0, 258); + for (i = 0; i < c->ncodes; i++) { + sorted_codes[c->codes[i].symbol] = c->codes[i].nbits; + } + for (i = 0; i < 19; i++) + bl_freq[i] = 0; + + bl_count = 0; + for (i = 0; i < 257; ) { + int j = i+1, n; + int v = sorted_codes[i]; + while (j < 257 && sorted_codes[j] == v) + j++; + + n = j-i; /* n = run-length */ + /* fprintf(stderr, "value=%d, run_len=%d\n", v, n); */ + if (v == 0) { + /* bit-len zero => no code and uses code 17/18 for run len */ + while (n > 0) { + while (n >= 11) { + bl_freq[18]++; + bl_code[bl_count] = 18; + bl_opt[bl_count] = MIN(n, 138)-11; + n -= bl_opt[bl_count]+11; + bl_count++; + } + + while (n >= 3) { + bl_freq[17]++; + bl_code[bl_count] = 17; + bl_opt[bl_count] = MIN(n, 10)-3; + n -= bl_opt[bl_count]+3; + bl_count++; + } + + switch (n) { + case 2: bl_code[bl_count++] = 0; bl_freq[0]++; n--; + case 1: bl_code[bl_count++] = 0; bl_freq[0]++; n--; + } + } + } else if (v <= 15) { + /* non-zero code, uses code 16 for run-len */ + if (n >= 4) { + bl_freq[v]++; + bl_code[bl_count++] = v; + n--; + while (n >= 3) { + bl_freq[16]++; + bl_code[bl_count] = 16; + bl_opt[bl_count] = MIN(n, 6)-3; + n -= bl_opt[bl_count]+3; + bl_count++; + } + } + + switch(n) { + case 3: bl_code[bl_count++] = v; bl_freq[v]++; n--; + case 2: bl_code[bl_count++] = v; bl_freq[v]++; n--; + case 1: bl_code[bl_count++] = v; bl_freq[v]++; n--; + } + } else { + fprintf(stderr, "Unsupported code length: %d\n", v); + } + + i = j; + } + hlit = 257; + + /* Add a single distance code of zero bits. This means that there + * are no distance codes used at all. + */ + bl_code[bl_count++] = 0; + bl_freq[0]++; + hdist = 1; + + /* Produce new huffman codes for our code-length symbols. */ + bl_cds = calc_bit_lengths(bl_code, bl_count, 0, 7, 0, 0, 1); + /* output_code_set (stderr, bl_cds); */ + + /* + *----------------------------------------------------------------- + * Output the "code length" bit-lengths, 3 bits at a time. + * + * Compute how many HCLEN code length values we need, using the + * predefined order in the RFC. + */ + for (hclen = 19; hclen > 0; hclen--) { + if (bl_freq[hclen_order[hclen-1]]) + break; + } + + store_bits(out, hlit-257, 5); + store_bits(out, hdist-1, 5); + store_bits(out, hclen-4, 4); + + for (i = 0; i < 19; i++) + hcmap[i] = -1; + for (i = 0; i < bl_cds->ncodes; i++) + hcmap[bl_cds->codes[i].symbol] = i; + for (i = 0; i < hclen; i++) { + if (hcmap[hclen_order[i]] >= 0) { + store_bits(out, bl_cds->codes[hcmap[hclen_order[i]]].nbits, 3); + } else { + store_bits(out, 0, 3); + } + } + + + /* + *---------------------------------------------------------------- + * Finally output the original bit-lengths using the code-len codes. + */ + for (i = 0; i < bl_count; i++) { + huffman_code_t *c = &bl_cds->codes[hcmap[bl_code[i]]]; + store_bits(out, c->code, c->nbits); + /* + fprintf(stderr, "bl_code %d (opt %d), code %d/%d\n", + bl_code[i], bl_opt[i], c->code, c->nbits); + */ + switch(bl_code[i]) { + case 18: + store_bits(out, bl_opt[i], 7); + break; + case 17: + store_bits(out, bl_opt[i], 3); + break; + case 16: + store_bits(out, bl_opt[i], 2); + break; + } + } + + huffman_codes_destroy(bl_cds); + + return 0; +} + +/* + * A wrapper around store_codes_single to output either a single or multiple + * huffman codes to a block. + * + * This also creates a new block and fills out the block header appropriately. + * + * Returns 0 on success + * -1 on failure + */ +int store_codes(block_t *out, huffman_codeset_t *cs, int last_block) { + int i; + + if (out->alloc < out->byte + 1000) { + out->alloc = out->byte + 1000; + if (NULL == (out->data = realloc(out->data, out->alloc))) + return -1; + } + + /* Header details */ + store_bits(out, last_block != 0, 1); /* last block */ + if (cs->ncodes == 1) { + store_bits(out, 2, 2); /* dynamic huffman */ + } else { + int nbits = 0; + store_bits(out, 3, 2); /* multiple tree dynamic huffman */ + while (1<<nbits <= cs->ncodes-1) + nbits++; + store_bits(out, nbits-1, 4); + store_bits(out, cs->ncodes-1, nbits); + } + + for (i = 0; i < cs->ncodes; i++) { + if (-1 == store_codes_single(out, cs->codes[i])) + return -1; + } + + return 0; +} + +/* + * This is the opposite of the store_codes() function. It loads generates + * huffman_codes_t structs from the a serialised data stream as presented + * in the above format. + * + * The input data is the data-string. On return the number of bytes + * consumed will be returned in *len_used (if non NULL). + * This is to allow stripping off of the huffman codes from a longer + * array of data (ie probably followed by the STHUFF encoded chunk + * itself). + * + * Returns: malloced huffman_codes_t structure on success. + * NULL on failure. + */ +huffman_codes_t *restore_codes_single(block_t *block) { + int hlit, hdist, hclen; + int hclen_order[19] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + int hc_bitlen[19], i; + huffman_codes_t *bl_cds, *cds; + int sym, sym_val, last_len; + int htab[256]; + + hlit = get_bits(block, 5)+257; + hdist = get_bits(block, 5)+1; + hclen = get_bits(block, 4)+4; + + /* + fprintf(stderr, "bfinal = %d, btype=%d\n", *bfinal, btype); + fprintf(stderr, "hlit=0x%x, hdist=0x%x, hclen=0x%x\n", + hlit, hdist, hclen); + */ + + /* Read HCLEN code-lengths and construct huffman codes from them */ + for (i = 0; i < hclen; i++) + hc_bitlen[hclen_order[i]] = get_bits(block, 3); + for (; i < 19; i++) + hc_bitlen[hclen_order[i]] = 0; + + bl_cds = (huffman_codes_t *)malloc(sizeof(*bl_cds)); + bl_cds->codes_static = 0; + bl_cds->ncodes = 0; + bl_cds->codes = (huffman_code_t *)malloc(19 * sizeof(*bl_cds->codes)); + bl_cds->max_code_len = 7; + for (i = 0; i < 19; i++) { + if (hc_bitlen[i]) { + bl_cds->codes[bl_cds->ncodes].symbol = i; + bl_cds->codes[bl_cds->ncodes].nbits = hc_bitlen[i]; + bl_cds->ncodes++; + } + } + + canonical_codes(bl_cds); + /* output_code_set (stderr, bl_cds); */ + + /* Build a lookup table of possible codes to symbols */ + for (i = 0; i < 256; i++) + htab[i] = -1; + for (i = 0; i < bl_cds->ncodes; i++) { + htab[bit_reverse(bl_cds->codes[i].code, bl_cds->codes[i].nbits) + | (1<<bl_cds->codes[i].nbits)] + = bl_cds->codes[i].symbol; + } + + /* Now decode the next HLIT literal codes using bl_cds */ + cds = (huffman_codes_t *)malloc(sizeof(*cds)); + cds->codes_static = 0; + cds->ncodes = 0; + cds->codes = (huffman_code_t *)malloc(257 * sizeof(*cds->codes)); + cds->max_code_len = 15; + sym_val = last_len = 0; + while ((sym = next_symbol(block, htab)) != -1) { + int count; + /* fprintf(stderr, "LIT Sym=%d\n", sym); */ + + switch(sym) { + case 16: + count = get_bits(block, 2) + 3; + /* fprintf(stderr, " +%d\n", count); */ + for (i = 0; i < count; i++) { + cds->codes[cds->ncodes].symbol = sym_val++; + cds->codes[cds->ncodes++].nbits = last_len; + } + break; + + case 17: + count = get_bits(block, 3) + 3; + /* fprintf(stderr, " +%d\n", count); */ + sym_val += count; + last_len = 0; + break; + + case 18: + count = get_bits(block, 7) + 11; + /* fprintf(stderr, " +%d\n", count); */ + sym_val += count; + last_len = 0; + break; + + case 0: + sym_val++; + last_len = 0; + break; + + default: + cds->codes[cds->ncodes].symbol = sym_val++; + last_len = cds->codes[cds->ncodes++].nbits = sym; + } + + if (sym_val >= hlit) + break; + } + assert(sym != -1); + assert(cds->ncodes <= 257); + + /* Skip HDIST codes. Hopefully only 1 of zero length */ + sym_val = 0; + while ((sym = next_symbol(block, htab)) != -1) { + /* fprintf(stderr, "DIST Sym=%d\n", sym); */ + + switch(sym) { + case 16: + sym_val += get_bits(block, 2) + 3; + break; + + case 17: + sym_val += get_bits(block, 3) + 3; + break; + + case 18: + sym_val += get_bits(block, 7) + 11; + break; + + default: + sym_val++; + } + + if (sym_val >= hdist) + break; + } + assert(sym != -1); + + huffman_codes_destroy(bl_cds); + canonical_codes(cds); + /* output_code_set(stderr, cds); */ + + return cds; +} + +huffman_codeset_t *restore_codes(block_t *block, int *bfinal) { + int btype; + huffman_codeset_t *cs; + + /* Header details */ + if (bfinal) + *bfinal = get_bits(block, 1); + else + get_bits(block, 1); + btype = get_bits(block, 2); + + cs = (huffman_codeset_t *)malloc(sizeof(*cs)); + cs->code_set = 0; + cs->blk = NULL; + cs->bit_num = 0; + cs->decode_t = NULL; + cs->decode_J4 = NULL; + + if (btype == 2) { + /* Standard Deflate algorithm */ + cs->ncodes = 1; + cs->codes = (huffman_codes_t **)malloc(cs->ncodes*sizeof(*cs->codes)); + cs->codes[0] = restore_codes_single(block); + } else if (btype == 3) { + /* Deflate extension - multiple codes */ + int nbits, i; + nbits = get_bits(block, 4) + 1; + cs->ncodes = get_bits(block, nbits) + 1; + cs->codes = (huffman_codes_t **)malloc(cs->ncodes*sizeof(*cs->codes)); + for (i = 0; i < cs->ncodes; i++) { + cs->codes[i] = restore_codes_single(block); + } + } else { + fprintf(stderr, "restore_codes() only implemented for " + "BTYPE == DYNAMIC HUFFMAN and INTERLACED HUFFMAN\n"); + return NULL; + } + + cs->bit_num = block->bit; + + return cs; +} + +/* + * Given a multiple sets of huffman codes and a block of data this + * compresses and returns the data block. It iterates around each set + * of huffman codes in a cyclic fashion encoding each byte with the + * appropriate huffman codes. + * + * Returns: 0 on success + * -1 on failure + */ +int huffman_multi_encode(block_t *blk, huffman_codeset_t *cs, + int code_set, unsigned char *data, int len) { + int i, nc; + huffman_code_t *lookup; + huffman_codes_t **c; + + if (!cs) { + /* No codes known, so derive our own */ + fprintf(stderr, "FIXME: use generate_code_set() to build our own codes here\n"); + return -1; + } else { + c = cs->codes; + nc = cs->ncodes; + } + + /* + * The maximum size to encode len bytes is < 9 bits per symbol + * (not quite 8 due to an EOF symbol) plus the overhead of the bit-length + * tree. That in turn, with alternating 8/9 bit-lengths would max out + * as 258*8 + 5+5+4 + 19*3 + 258*5 bits (429 bytes), but in practice + * I'm not even sure if it's possible to construct such a set of code + * lengths that would compress that poor. + * + * This of course assumes we're using appropriate compression codes. + * Given a user may give a completely inappropriate code we have to + * assume every symbol is actually 15 bits instead of < 9 on average. + * + * We ensure blk here is large enough for the worst case scenario so we + * don't incur overheads in store_bits(). + */ + if (blk->alloc <= 429 + 2*len + 2 + blk->byte) { + blk->alloc = 429 + 2*len + 2 + blk->byte; + blk->data = realloc(blk->data, blk->alloc); + if (!blk->data) + return -1; + } + + /* + * Splitting this special case out is worth it as it's approx 30% faster. + * Also note that the nc > 1 case is faster with a separate counter and + * test than using modulus (by a factor of 2). It could be sped up further + * for powers of 2 using bitwise AND, but the difference is not huge. + */ + if (nc == 1) { + lookup = c[0]->lookup; + for (i = 0; i < len; i++) { + assert(lookup[data[i]].nbits > 0); + store_bits(blk, lookup[data[i]].code, + lookup[data[i]].nbits); + } + } else { + int count = 0; + for (i = 0; i < len; i++) { + lookup = c[count]->lookup; + assert(lookup[data[i]].nbits > 0); + store_bits(blk, lookup[data[i]].code, + lookup[data[i]].nbits); + if (++count == nc) + count = 0; + } + } + + lookup = c[i%nc]->lookup; + store_bits(blk, lookup[SYM_EOF].code, lookup[SYM_EOF].nbits); + + assert(blk->alloc > blk->byte); + + /* We probably massively overallocated, so return some of it back */ + blk->data = realloc(blk->data, blk->byte+1); + blk->alloc = blk->byte+1; + + return 0; +} + +/* + * The opposite of huffman_encode(). + * Decode a huffman stream from 'block' using huffman codes 'c'. + * + * Returns: allocated block_t pointer on success + * NULL on failure. + * + * Method 1 + * -------- + * + * At any node in our tree we can precompute a lookup table so that upon + * reading the next 'k' bits we know the new node we'd end up in and what + * symbols to export. + * Then decoding simply works in fixed sets of k bits at a time. + * + * We use k=4 for efficient table space (they fit neatly in cache) and ease + * of decoding 4-bits at a time. k=8 is about 20% faster as reading the input + * byte by byte is easy, but the setup time is substantially longer + * (16x at a guess) and the lookup tables no longer fit in the L1 cache. + */ +block_t *huffman_decode(block_t *in, huffman_codes_t *c) { + block_t *out; + htree_t t[513]; /* number of internal nodes */ + int i, j, n; + int new_node, node_num; + h_jump4_t J4[513][16]; + unsigned char *cp; + + if (NULL == (out = block_create(NULL, 8*in->alloc+8))) { + block_destroy(in, 0); + return NULL; + } + + /* Construct the tree from the codes */ + new_node = 1; + t[0].l[0] = t[0].l[1] = -1; + t[0].c[0] = t[0].c[1] = 0; + for (i = 0; i < c->ncodes; i++) { + int n = 0; + unsigned int v = c->codes[i].code; + + for (j = 0; j < c->codes[i].nbits-1; j++) { + int b = v & 1; + if (t[n].c[b]) { + n = t[n].c[b]; + } else { + n = (t[n].c[b] = new_node++); + t[n].c[0] = t[n].c[1] = 0; + t[n].l[0] = t[n].l[1] = -1; + } + v >>= 1; + } + /* last bit */ + t[n].l[v & 1] = c->codes[i].symbol; + } + + /* Build the 16 wide lookup table per node */ + for (n = 0; n < new_node; n++) { + for (j = 0; j < 16; j++) { + unsigned int v = j; + int n2 = n; + h_jump4_t *hj = &J4[n][j]; + hj->nsymbols = 0; + hj->top_bit = 0; + for (i = 0; i < 4; i++) { + int b = v & 1; + if (t[n2].l[b] >= 0) { + hj->symbol[hj->nsymbols++] = t[n2].l[b]; + if (t[n2].l[b] == SYM_EOF) + if (!hj->top_bit) + hj->top_bit |= 1 << (hj->nsymbols-1); + } + n2 = t[n2].c[b]; + v >>= 1; + } + hj->jump = n2; + } + } + +#if 0 + /* Debug output */ + for (n = 0; n < new_node; n++) { + printf("Node %d, c[]={%d,%d}, l[]={%d,%d}\n", + n, t[n].c[0], t[n].c[1], t[n].l[0], t[n].l[1]); + for (i = 0; i < 256; i++) { + printf("\t%02x %s =>%02d, ", i, print_8rev(i), J4[n][i].jump); + for (k = 0; k < J4[n][i].nsymbols; k++) { + if (isprint(J4[n][i].symbol[k])) + printf(" '%c'", J4[n][i].symbol[k]); + else + printf(" %03d", J4[n][i].symbol[k]); + } + printf("\n"); + } + } +#endif + + + /* + * Decoding - part 1 + * We're part way through a byte, so decode bit by bit up to the next + * whole byte and then we start the fast decoding section. + */ + cp = out->data; + node_num = 0; + while (in->bit != 0) { + int b = get_bits(in, 1); + if (t[node_num].l[b] != -1) { + if (t[node_num].l[b] != SYM_EOF) { + *cp++ = t[node_num].l[b]; + } else { + out->byte = cp - out->data; + return out; + } + } + node_num = t[node_num].c[b]; + } + + /* + * Decoding - part 2 + * + * We handle data nibble by nibble using the nibble to get an + * h_jump4_t lookup from the J4[] table. + * If top_bit is clear then we know we have no funny business (SYM_EOF) + * so we use a fast decoding technique, otherwise we have to do a slower + * loop with a check. + */ + { + int last_node = 0; + unsigned char *last_cp = cp; + h_jump4_t *x = &J4[node_num][in->data[in->byte] & 0x0f]; + int l = x->nsymbols; + int b; + + /* + * This is the tight loop, so we over-optimise here by ignoring EOF + * and relying on knowing the length of the input data stream. + * This allows us to ignore the 9-bit data and only operate on + * the basic 0-255 symbols, glossing over the minor issue that EOF + * will look like an ordinary symbol. + */ + for (i = in->byte; i < in->alloc; i++) { + last_cp = cp; + last_node = node_num; + + x = &J4[node_num][in->data[i] & 0x0f]; + l = x->nsymbols; + + for (j = 0; j < l; j++) { + *cp++ = x->symbol[j]; + } + node_num = x->jump; + + if (x->top_bit) + break; + + x = &J4[node_num][(in->data[i] >> 4) & 0x0f]; + l = x->nsymbols; + + for (j = 0; j < l; j++) { + *cp++ = x->symbol[j]; + } + node_num = x->jump; + + if (x->top_bit) + break; + } + + + /* + * Decoding - part 3 + * + * The above optimisation has unfortunately added EOF to our data + * along with whatever else was packed in the last byte after the + * EOF symbol. So we rewind one byte and finish off decoding + * the slow way - walking the tree. + */ + cp = last_cp; + node_num = last_node; + in->byte = i; + in->bit = 0; + while (-1 != (b = get_bits(in, 1))) { + if (t[node_num].l[b] != -1) { + if (t[node_num].l[b] != SYM_EOF) { + *cp++ = t[node_num].l[b]; + } else { + out->byte = cp - out->data; + return out; + } + } + node_num = t[node_num].c[b]; + } + } + + + /* We shouldn't reach here */ + return NULL; +} + +int init_decode_tables(huffman_codeset_t *cs) { + int nnodes, i, j, n, nc; + huffman_codes_t **c; + int new_node, rec; + h_jump4_t (*J4)[16] = NULL; + htree_t *t; + + c = cs->codes; + nc = cs->ncodes; + + /* Allocate memory for internal nodes (nsyms-1 for each code set) */ + for (nnodes = i = 0; i < nc; i++) { + nnodes += c[i]->ncodes-1; + } + + if (NULL == (t = (htree_t *)malloc(nnodes * sizeof(*t)))) + goto error; + + if (NULL == (J4 = (h_jump4_t (*)[16])malloc(nnodes * sizeof(*J4)))) + goto error; + + /* + * Construct the tree from the codes. + * We have one tree for all 'nc' huffman codes with each tree pointing + * to the root of the next one (or first) tree whenever we emit a + * symbol. + * + * This then effectively means the decoding step is identical to the + * single huffman code function. + */ + new_node = 0; + for (rec = 0; rec < nc; rec++) { + int root = new_node++; + int next_root = rec == nc-1 + ? 0 + : root + c[rec]->ncodes-1; + + t[root].l[0] = t[root].l[1] = -1; + t[root].c[0] = t[root].c[1] = next_root; + for (i = 0; i < c[rec]->ncodes; i++) { + int n = root; + unsigned int v = c[rec]->codes[i].code; + + for (j = 0; j < c[rec]->codes[i].nbits-1; j++) { + int b = v & 1; + if (t[n].c[b] != next_root) { + n = t[n].c[b]; + } else { + n = (t[n].c[b] = new_node++); + t[n].c[0] = t[n].c[1] = next_root; + t[n].l[0] = t[n].l[1] = -1; + } + v >>= 1; + } + /* last bit */ + t[n].l[v & 1] = c[rec]->codes[i].symbol; + } + } + + /* + for (i = 0; i < new_node; i++) { + printf("t[%d] = {(%d,%d), (%d,%d)}\n", + i, + t[i].l[0], t[i].l[1], + t[i].c[0], t[i].c[1]); + } + */ + + /* Build the 16 wide lookup table per node */ + for (n = 0; n < new_node; n++) { + for (j = 0; j < 16; j++) { + unsigned int v = j; + int n2 = n; + h_jump4_t *hj = &J4[n][j]; + hj->nsymbols = 0; + hj->top_bit = 0; + for (i = 0; i < 4; i++) { + int b = v & 1; + if (t[n2].l[b] >= 0) { + hj->symbol[hj->nsymbols++] = t[n2].l[b]; + if (t[n2].l[b] == SYM_EOF) + if (!hj->top_bit) + hj->top_bit |= 1 << (hj->nsymbols-1); + } + n2 = t[n2].c[b]; + v >>= 1; + } + hj->jump = n2; + /* + printf("J4[%d][%d] = {'%.*s', %d}\n", + n, j, hj->nsymbols, hj->symbol, n2); + */ + } + } + + cs->decode_t = t; + cs->decode_J4 = J4; + + return 0; + + error: + if (t) + free(t); + + if (J4) + free(J4); + + cs->decode_t = NULL; + cs->decode_J4 = NULL; + + return -1; +} + +/* + * The opposite of huffman_encode(). + * Decode a huffman stream from 'block' using huffman codes 'c'. + * + * Returns: allocated block_t pointer on success + * NULL on failure. + * + * Method 1 + * -------- + * + * At any node in our tree we can precompute a lookup table so that upon + * reading the next 'k' bits we know the new node we'd end up in and what + * symbols to export. + * Then decoding simply works in fixed sets of k bits at a time. + * + * We use k=4 for efficient table space (they fit neatly in cache) and ease + * of decoding 4-bits at a time. k=8 is about 20% faster as reading the input + * byte by byte is easy, but the setup time is substantially longer + * (16x at a guess) and the lookup tables no longer fit in the L1 cache. + * + * NB: This version also handles multiple interleaved huffman codes as + * this support doesn't really slow down the decoding process. + */ +block_t *huffman_multi_decode(block_t *in, huffman_codeset_t *cs) { + block_t *out = NULL; + int i, j; + int node_num; + unsigned char *cp; + huffman_codes_t **c; + int nc; + h_jump4_t (*J4)[16]; + htree_t *t; + + if (!cs) + return NULL; + c = cs->codes; + nc = cs->ncodes; + + /* Ensure precomputed lookup tables exist */ + if (!cs->decode_t || !cs->decode_J4) + if (-1 == init_decode_tables(cs)) + return NULL; + + t = cs->decode_t; + J4 = cs->decode_J4; + + if (NULL == (out = block_create(NULL, 9*(in->alloc+1)))) { + goto error; + } + + /* + * Decoding - part 1 + * We're part way through a byte, so decode bit by bit up to the next + * whole byte and then we start the fast decoding section. + */ + cp = out->data; + node_num = 0; + while (in->bit != 0) { + int b = get_bits(in, 1); + htree_t *t2 = &t[node_num]; + if (t2->l[b] != -1) { + if (t2->l[b] != SYM_EOF) { + *cp++ = t2->l[b]; + } else { + out->byte = cp - out->data; + goto success; + } + } + node_num = t2->c[b]; + } + + /* + * Decoding - part 2 + * + * We now handle data nibble by nibble using the nibble to get an + * h_jump4_t lookup from the J4[] table. + * If top_bit is clear then we know we have no funny business (SYM_EOF) + * so we use a fast decoding technique, otherwise we have to do a slower + * loop with a check. + */ + { + int last_node = node_num; + unsigned char *last_cp = cp; + h_jump4_t *x = &J4[node_num][in->data[in->byte] & 0x0f]; + int l = x->nsymbols; + int b; + + /* + * This is the tight loop, so we over-optimise here by ignoring EOF + * and relying on knowing the length of the input data stream. + * This allows us to ignore the 9-bit data and only operate on + * the basic 0-255 symbols, glossing over the minor issue that EOF + * will look like an ordinary symbol. + */ + for (i = in->byte; i < in->alloc; i++) { + last_cp = cp; + last_node = node_num; + + x = &J4[node_num][in->data[i] & 0x0f]; + l = x->nsymbols; + + /* printf("val=%d\n", in->data[i] & 0x0f); */ + for (j = 0; j < l; j++) { + *cp++ = x->symbol[j]; + } + node_num = x->jump; + + if (x->top_bit) + break; + + x = &J4[node_num][(in->data[i] >> 4) & 0x0f]; + l = x->nsymbols; + + for (j = 0; j < l; j++) { + *cp++ = x->symbol[j]; + } + node_num = x->jump; + + if (x->top_bit) + break; + } + + + /* + * Decoding - part 3 + * + * The above optimisation has unfortunately added EOF to our data + * along with whatever else was packed in the last byte after the + * EOF symbol. So we rewind one byte and finish off decoding + * the slow way - walking the tree. + */ + cp = last_cp; + node_num = last_node; + in->byte = i; + in->bit = 0; + while (-1 != (b = get_bits(in, 1))) { + htree_t *t2 = &t[node_num]; + if (t2->l[b] != -1) { + if (t2->l[b] != SYM_EOF) { + *cp++ = t2->l[b]; + } else { + out->byte = cp - out->data; + goto success; + } + } + node_num = t2->c[b]; + } + } + + success: + return out; + + error: + if (out) + block_destroy(out, 0); + + return NULL; +} + +#if 0 +/* A simple to understand (but slow) version of the above function */ +block_t *huffman_multi_decode(block_t *in, huffman_codes_t **c, int nc) { + block_t *out; + htree_t (*t)[513]; /* number of internal nodes */ + int i, j, n, rec; + int new_node, node_num; + unsigned char *cp; + int bC; /* byte count */ + + if (NULL == (t = (htree_t (*)[513])malloc(nc * sizeof(*t)))) + return NULL; + + if (NULL == (out = block_create(NULL, 8*in->alloc+8))) { + block_destroy(in, 0); + free(t); + return NULL; + } + + /* Construct the tree from the codes */ + for (rec = 0; rec < nc; rec++) { + new_node = 1; + t[rec][0].l[0] = t[rec][0].l[1] = -1; + t[rec][0].c[0] = t[rec][0].c[1] = 0; + for (i = 0; i < c[rec]->ncodes; i++) { + int n = 0; + unsigned int v = c[rec]->codes[i].code; + + for (j = 0; j < c[rec]->codes[i].nbits-1; j++) { + int b = v & 1; + if (t[rec][n].c[b]) { + n = t[rec][n].c[b]; + } else { + n = (t[rec][n].c[b] = new_node++); + t[rec][n].c[0] = t[rec][n].c[1] = 0; + t[rec][n].l[0] = t[rec][n].l[1] = -1; + } + v >>= 1; + } + /* last bit */ + t[rec][n].l[v & 1] = c[rec]->codes[i].symbol; + } + } + + /* + * Decoding - the slow way. How to speed up multi-code decoding? + */ + cp = out->data; + node_num = 0; + bC = 0; + while (in->byte < in->alloc) { + htree_t *t2; + int b = get_bits(in, 1); + t2 = &t[bC%nc][node_num]; + if (t2->l[b] != -1) { + if (t2->l[b] != SYM_EOF) { + *cp++ = t2->l[b]; + bC++; + node_num = 0; + } else { + out->byte = cp - out->data; + free(t); + return out; + } + } else { + node_num = t2->c[b]; + } + } + + /* We shouldn't reach here */ + free(t); + return NULL; +} +#endif + +/* + * A slow version of the above huffman_decode function. This is designed as + * a piecemeal decoder for purposes of restoring the huffman codes themselves. + * NB: this only works for code lengths small enough to keep inside the + * htab[] dimensions - IT DOES NOT CHECK THIS. + * + * Returns the next symbol + * -1 for failure + */ +int next_symbol(block_t *in, int *htab) { + int b, v = 0, c = 1; + while ((b = get_bits(in, 1)) != -1) { + v = (v<<1) | b | (c <<= 1); + + if (htab[v] != -1) + return htab[v]; + } + + return -1; +} + + +/* + * --------------------------------------------------------------------------- + * Debug code. This turns the library into a stand-alone program for + * easy debugging.x + */ +static void output_code_set(FILE *fp, huffman_codes_t *cds) { + int i, j; + int nbits_in = 0, nbits_out = 0; + huffman_code_t *codes = cds->codes; + int ncodes = cds->ncodes; + + fprintf(fp, "static huffman_code_t codes_FIXME[] = {\n"); + for (i = j = 0; i < ncodes; i++) { + nbits_out += codes[i].nbits * codes[i].freq; + nbits_in += 8*codes[i].freq; + if (j == 0) + fprintf(fp, " "); + if (codes[i].symbol == SYM_EOF) { + fprintf(fp, "{SYM_EOF,%3d}, ", codes[i].nbits); + j = 10; + } else { + if (isalnum(codes[i].symbol)) { + fprintf(fp, "{'%c',%3d}, ", codes[i].symbol, codes[i].nbits); + } else { + fprintf(fp, "{%3d,%3d}, ", codes[i].symbol, codes[i].nbits); + } + } + j++; + + if (j >= 6) { + fputc('\n', fp); + j = 0; + } + } + if (j) + fputc('\n', fp); + fprintf(fp, "};\n"); + fprintf(fp, "/* Expected compression to %f of input */\n", + (double)nbits_out/nbits_in); +} + +static void output_code_set2(FILE *fp, huffman_codes_t *cds) { + int i; + huffman_code_t *codes = cds->codes; + int ncodes = cds->ncodes; + + fprintf(fp, "huffman_code_t = {\n"); + for (i = 0; i < ncodes; i++) { + fprintf(fp, "\t%d:\t%3d %c %2d %04x %d\n", + i,codes[i].symbol, codes[i].symbol, + codes[i].nbits, codes[i].code, + codes[i].freq); + } + fprintf(fp, "};\n"); +} + + +/* + * -------------------------------------------------------------------------- + * A test main() to create an application capable of compressing and + * uncompressing stdin. + */ + +#ifdef TEST_MAIN +#include <fcntl.h> +/* #include <zlib.h> */ + +/* + * Slurps the entirety of stdin into a malloced buffer and returns a pointer + * to it. + * + * Returns: malloced buffer on success, *lenp equal to length + * NULL on failure + */ +static unsigned char *load(int *lenp, char *fn) { + unsigned char *data = NULL; + int dsize = 0; + int dcurr = 0, len; + int fd = 0; + + if (fn) { + if (-1 == (fd = open(fn, O_RDONLY, 0))) { + perror(fn); + return NULL; + } + } + + do { + if (dsize - dcurr < 8192) { + dsize = dsize ? dsize * 2 : 8192; + if (NULL == (data = realloc(data, dsize))) { + if (fd) + close(fd); + return NULL; + } + } + + len = read(fd, data + dcurr, 8192); + if (len > 0) + dcurr += len; + } while (len > 0); + + if (len == -1) { + perror("read"); + if (fd) + close(fd); + return NULL; + } + + if (fd) + close(fd); + + *lenp = dcurr; + return data; +} + +/* + * Returns 0 for success + * -1 for failure. + */ +int decode_main(unsigned char *data, int len, int code_set) { + huffman_codeset_t *cs = NULL; + block_t *blk_in; + unsigned char *out; + int err, out_len, i; + int bfinal; + + blk_in = block_create(NULL, 1000 + len); + + if (code_set != 0) { + cs = generate_code_set(code_set, 1, /* no. codes */ + NULL, 0, /* data + size */ + 1, /* eof */ + MAX_CODE_LEN, + 0); /* all_codes */ + store_codes(blk_in, cs, 1); + } + + if (blk_in->bit != 0) { + blk_in->data[blk_in->byte] |= data[0]; + memcpy(&blk_in->data[blk_in->byte+1], data+1, len-1); + } else { + memcpy(&blk_in->data[blk_in->byte], data, len); + } + + /* Do the decoding */ + do { + block_t *out; + if (!cs) + cs = restore_codes(blk_in, &bfinal); + out = huffman_multi_decode(blk_in, cs); + write(1, out->data, out->byte); + block_destroy(out, 0); + huffman_codeset_destroy(cs); + cs = NULL; + } while (!bfinal); + + block_destroy(blk_in, 0); + + return 0; +} + +/* + * Returns 0 for success + * -1 for failure. + */ +int encode_main(unsigned char *data, int len, int code_set, int rec_size, + int blk_size, int dump_tree, int exit_after_tree) { + /* Encoding */ + unsigned char *d2 = data; + block_t *blk; + huffman_codeset_t *cs; + + blk = block_create(NULL, 8192); + fprintf(stderr, "Input %d bytes\n", len); + + do { + int l2 = len > blk_size ? blk_size : len; + int i; + + if (code_set != 0) + l2 = len; /* predefined code-sets have final-block bit set */ + cs = generate_code_set(code_set, rec_size, + d2, l2, /* Data and length */ + 1, /* eof */ + MAX_CODE_LEN, + 0); /* all codes */ + if (!cs) + return -1; + + if (dump_tree) { + for (i = 0; i < rec_size; i++) { + printf("==Sub-set %d==\n", i); + output_code_set(stdout, cs->codes[i]); + /* output_code_set2(stdout, cs->codes[i]); */ + } + if (exit_after_tree) + return 0; + } + + store_codes(blk, cs, l2 == len); + if (code_set != 0) { + blk->data[blk->byte = 0] = 0; /* starting bit no. preseved */ + } else { + /* + fprintf(stderr, "codes finished at %d bytes, %d bits\n", + blk->byte, blk->bit); + */ + } + if (exit_after_tree) { + write(1, blk->data, blk->byte + (blk->bit != 0)); + return 0; + } + + huffman_multi_encode(blk, cs, code_set, d2, l2); + + huffman_codeset_destroy(cs); + len -= l2; + d2 += l2; + + } while (len > 0); + write(1, blk->data, blk->byte + (blk->bit != 0)); + + fprintf(stderr, "Output %ld bytes\n", blk->byte + (blk->bit != 0)); + block_destroy(blk, 0); + + return 0; +} + +int main(int argc, char **argv) { + unsigned char *data; + int decode = 0; + int dump_tree = 0; + int exit_after_tree = 0; + int code_set = CODE_INLINE; + int blk_size = 0x7fff; + int rec_size = 1; + int c, r, len; + char *fn = NULL; + + while ((c = getopt(argc, argv, "c:detxl:b:hr:i:")) != -1) { + switch (c) { + case 'b': + blk_size = atoi(optarg); + break; + + case 'c': + code_set = atoi(optarg); + break; + + case 'r': + rec_size = atoi(optarg); + break; + + case 'd': + decode = 1; + break; + + case 'e': + decode = 0; + break; + + case 't': + dump_tree = 1; + break; + + case 'x': + exit_after_tree = 1; + break; + + case 'i': + fn = optarg; + break; + + default: + fprintf(stderr, "Usage: huffman_static [options] < stdin > stdout\n"); + fprintf(stderr, " Decoding options\n"); + fprintf(stderr, " -d\tdecode flag\n"); + fprintf(stderr, " Encoding options\n"); + fprintf(stderr, " -e\tencode flag\n"); + fprintf(stderr, " -b size\tspecify the block-size\n"); + fprintf(stderr, " -c code\tspecify code-set. 0 => inline\n"); + fprintf(stderr, " -t\tpretty-print the code-set used\n"); + fprintf(stderr, " -x\texit after outputting code-set\n"); + exit(c != 'h'); + } + } + + data = load(&len, fn); + + r = decode + ? decode_main(data, len, code_set) + : encode_main(data, len, code_set, rec_size, + blk_size, dump_tree, exit_after_tree); + + free(data); + + return r == 0 ? 0 : 1; +} +#endif