Mercurial > repos > dawe > srf2fastq
diff srf2fastq/io_lib-1.12.2/progs/ztr_dump.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/srf2fastq/io_lib-1.12.2/progs/ztr_dump.c Tue Jun 07 17:48:05 2011 -0400 @@ -0,0 +1,217 @@ +#include <stdio.h> +#include <errno.h> +#include <math.h> +#include <io_lib/ztr.h> +#include <io_lib/compression.h> +#include <io_lib/xalloc.h> + +static char *format2str(int format) { + static char unk[100]; + + switch (format) { + case ZTR_FORM_RAW: return "raw"; + case ZTR_FORM_RLE: return "rle"; + case ZTR_FORM_XRLE: return "xrle"; + case ZTR_FORM_XRLE2: return "xrle2"; + case ZTR_FORM_ZLIB: return "zlib"; + case ZTR_FORM_DELTA1: return "delta1"; + case ZTR_FORM_DELTA2: return "delta2"; + case ZTR_FORM_DELTA4: return "delta4"; + case ZTR_FORM_DDELTA1: return "ddelta1"; + case ZTR_FORM_DDELTA2: return "ddelta2"; + case ZTR_FORM_DDELTA4: return "ddelta4"; + case ZTR_FORM_16TO8: return "16to8"; + case ZTR_FORM_32TO8: return "32to8"; + case ZTR_FORM_FOLLOW1: return "follow1"; + case ZTR_FORM_CHEB445: return "cheb445"; + case ZTR_FORM_ICHEB: return "icheb"; + case ZTR_FORM_LOG2: return "log2"; + case ZTR_FORM_STHUFF: return "sthuff"; + case ZTR_FORM_QSHIFT: return "qshift"; + case ZTR_FORM_TSHIFT: return "tshift"; + } + + sprintf(unk, "?%d?\n", format); + return unk; +} + +/* + * Shannon showed that for storage in base 'b' with alphabet symbols 'a' having + * a probability of ocurring in any context of 'Pa' we should encode + * symbol 'a' to have a storage width of -logb(Pa). + * + * Eg. b = 26, P(e) = .22. => width .4647277. + * + * We use this to calculate the entropy of a signal by summing over all letters + * in the signal. In this case, our storage has base 256. + */ +#define EBASE 256 +double entropy(unsigned char *data, int len) { + double E[EBASE]; + double P[EBASE]; + double e; + int i; + + for (i = 0; i < EBASE; i++) + P[i] = 0; + + for (i = 0; i < len; i++) + P[data[i]]++; + + for (i = 0; i < EBASE; i++) { + if (P[i]) { + P[i] /= len; + E[i] = -(log(P[i])/log(EBASE)); + } else { + E[i] = 0; + } + } + + for (e = i = 0; i < len; i++) + e += E[data[i]]; + + return e; +} + +/* Debug version of the ztr.c uncompress_chunk function. */ +static int explode_chunk(ztr_t *ztr, ztr_chunk_t *chunk) { + char *new_data = NULL; + int new_len; + + while (chunk->dlength > 0 && chunk->data[0] != ZTR_FORM_RAW) { + double ent = entropy((unsigned char *)chunk->data, chunk->dlength); + + switch (chunk->data[0]) { + case ZTR_FORM_RLE: + new_data = unrle(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_XRLE: + new_data = unxrle(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_XRLE2: + new_data = unxrle2(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_ZLIB: + new_data = zlib_dehuff(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_DELTA1: + new_data = recorrelate1(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_DELTA2: + new_data = recorrelate2(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_DELTA4: + new_data = recorrelate4(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_16TO8: + new_data = expand_8to16(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_32TO8: + new_data = expand_8to32(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_FOLLOW1: + new_data = unfollow1(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_ICHEB: + new_data = ichebuncomp(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_LOG2: + new_data = unlog2_data(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_STHUFF: + new_data = unsthuff(ztr, chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_QSHIFT: + new_data = unqshift(chunk->data, chunk->dlength, &new_len); + break; + + case ZTR_FORM_TSHIFT: + new_data = untshift(ztr, chunk->data, chunk->dlength, &new_len); + break; + + default: + fprintf(stderr, "Unknown encoding format %d\n", chunk->data[0]); + return -1; + } + + if (!new_data) { + fprintf(stderr, "Failed to decode chunk with format %s\n", + format2str(chunk->data[0])); + return -1; + } + + printf(" format %8s => %6d to %6d, entropy %8.1f to %8.1f\n", + format2str(chunk->data[0]), chunk->dlength, new_len, + ent, entropy((unsigned char *)new_data, new_len)); + + chunk->dlength = new_len; + xfree(chunk->data); + chunk->data = new_data; + } + + return 0; +} + +int main(int argc, char **argv) { + ztr_t *ztr; + mFILE *fp; + int i; + + if (argc >= 2) { + if (NULL == (fp = mfopen(argv[1], "rb"))) { + perror(argv[1]); + return 1; + } + } else { + fp = mstdin(); + } + + if (NULL == (ztr = mfread_ztr(fp))) { + perror("fread_ztr"); + return 1; + } + + printf("Nchunks = %d\n", ztr->nchunks); + for (i = 0; i < ztr->nchunks; i++) { + char str[5]; + int complen; + int rawlen; + char *val; + + (void)ZTR_BE2STR(ztr->chunk[i].type, str); + complen = ztr->chunk[i].dlength; + val = ztr_lookup_mdata_value(ztr, &ztr->chunk[i], "TYPE"); + if (val) + printf("-- %s (%s) --\n", str, val); + else + printf("-- %s --\n", str); + explode_chunk(ztr, &ztr->chunk[i]); + rawlen = ztr->chunk[i].dlength; + printf("SUMMARY %s mlen %3d, dlen %6d, rawlen %6d, ratio %f\n", + str, ztr->chunk[i].mdlength, + complen, rawlen, (double)complen/rawlen); +#if 0 + fflush(stdout); + puts("\n========================================"); + write(1, ztr->chunk[i].data, ztr->chunk[i].dlength); + puts("\n========================================"); +#endif + } + + delete_ztr(ztr); + + return 0; +}