comparison srf2fastq/io_lib-1.12.2/io_lib/ztr.c @ 0:d901c9f41a6a default tip

Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author dawe
date Tue, 07 Jun 2011 17:48:05 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d901c9f41a6a
1 #include <stdio.h>
2 #include <string.h>
3 #include <math.h>
4
5 /* #include <fcntl.h> */
6
7 #include "io_lib/ztr.h"
8 #include "io_lib/xalloc.h"
9 #include "io_lib/Read.h"
10 #include "io_lib/compression.h"
11 #include "io_lib/stdio_hack.h"
12 #include "io_lib/deflate_interlaced.h"
13
14 /* Deprecated #define - see solexa2srf for the more up to date code */
15 /* #define ILLUMINA_GA */
16
17 /*
18 * ---------------------------------------------------------------------------
19 * Trace writing functions.
20 * These consist of several encoding functions, all with the same prototype,
21 * and a single fwrite_ztr function to wrap it all up.
22 * ---------------------------------------------------------------------------
23 */
24
25 /*
26 * ztr_write_header
27 *
28 * Writes a ZTR file header.
29 *
30 * Arguments:
31 * fp A FILE pointer
32 * h A pointer to the header to write
33 *
34 * Returns:
35 * Success: 0
36 * Failure: -1
37 */
38 static int ztr_write_header(FILE *fp, ztr_header_t *h) {
39 if (1 != fwrite(h, sizeof(*h), 1, fp))
40 return -1;
41
42 return 0;
43 }
44
45 /*
46 * ztr_write_chunk
47 *
48 * Writes a ZTR chunk including chunk header and data
49 *
50 * Arguments:
51 * fp A FILE pointer
52 * chunk A pointer to the chunk to write
53 *
54 * Returns:
55 * Success: 0
56 * Failure: -1
57 */
58 static int ztr_write_chunk(FILE *fp, ztr_chunk_t *chunk) {
59 int4 bei4;
60
61 /*
62 {
63 char str[5];
64 fprintf(stderr, "Write chunk %.4s %08x length %d\n",
65 ZTR_BE2STR(chunk->type, str), chunk->type, chunk->dlength);
66 }
67 */
68
69 /* type */
70 bei4 = be_int4(chunk->type);
71 if (1 != fwrite(&bei4, 4, 1, fp))
72 return -1;
73
74 /* metadata length */
75 bei4 = be_int4(chunk->mdlength);
76 if (1 != fwrite(&bei4, 4, 1, fp))
77 return -1;
78
79 /* metadata */
80 if (chunk->mdlength)
81 if (chunk->mdlength != fwrite(chunk->mdata, 1, chunk->mdlength, fp))
82 return -1;
83
84 /* data length */
85 bei4 = be_int4(chunk->dlength);
86 if (1 != fwrite(&bei4, 4, 1, fp))
87 return -1;
88
89 /* data */
90 if (chunk->dlength != fwrite(chunk->data, 1, chunk->dlength, fp))
91 return -1;
92
93 return 0;
94 }
95
96 /*
97 * fwrite_ztr
98 *
99 * Writes a ZTR file held in the ztr_t structure.
100 * It is assumed that all the correct lengths, magic numbers, etc in the
101 * ztr_t struct have already been initialised correctly.
102 *
103 * FIXME: Add a 'method' argument which encodes formats? Perhaps store this
104 * in the ztr struct?
105 *
106 * Arguments:
107 * fp A writable FILE pointer
108 * ztr A pointer to the ztr_t struct to write.
109 *
110 * Returns:
111 * Success: 0
112 * Failure: -1
113 */
114 int fwrite_ztr(FILE *fp, ztr_t *ztr) {
115 int i;
116
117 /* Write the header record */
118 if (-1 == ztr_write_header(fp, &ztr->header))
119 return -1;
120
121 /* Write the chunks */
122 for (i = 0; i < ztr->nchunks; i++) {
123 if (-1 == ztr_write_chunk(fp, &ztr->chunk[i]))
124 return -1;
125 #if 0
126 {
127 int fd;
128 char fname[1024];
129 sprintf(fname, "chunk.%d", i);
130 fd = open(fname, O_RDWR|O_CREAT|O_TRUNC, 0666);
131 write(fd, ztr->chunk[i].data, ztr->chunk[i].dlength);
132 close(fd);
133 }
134 #endif
135 }
136
137 return 0;
138 }
139
140 /*
141 * ---------------------------------------------------------------------------
142 * Trace reading functions.
143 * These consist of several decoding functions, all with the same prototype,
144 * and a single fread_ztr function to wrap it all up.
145 * ---------------------------------------------------------------------------
146 */
147
148 /*
149 * ztr_read_header
150 *
151 * Reads a ZTR file header.
152 *
153 * Arguments:
154 * fp A FILE pointer
155 * h Where to write the header to
156 *
157 * Returns:
158 * Success: 0
159 * Failure: -1
160 */
161 int ztr_read_header(FILE *fp, ztr_header_t *h) {
162 if (1 != fread(h, sizeof(*h), 1, fp))
163 return -1;
164
165 return 0;
166 }
167
168 /*
169 * ztr_read_chunk_hdr
170 *
171 * Reads a ZTR chunk header and metadata, but not the main data segment.
172 *
173 * Arguments:
174 * fp A FILE pointer
175 *
176 * Returns:
177 * Success: a chunk pointer (malloced)
178 * Failure: NULL
179 */
180 ztr_chunk_t *ztr_read_chunk_hdr(FILE *fp) {
181 int4 bei4;
182 ztr_chunk_t *chunk;
183
184 if (NULL == (chunk = (ztr_chunk_t *)xmalloc(sizeof(*chunk))))
185 return NULL;
186
187 /* type */
188 if (1 != fread(&bei4, 4, 1, fp)) {
189 xfree(chunk);
190 return NULL;
191 }
192 chunk->type = be_int4(bei4);
193
194 /* metadata length */
195 if (1 != fread(&bei4, 4, 1, fp)) {
196 xfree(chunk);
197 return NULL;
198 }
199 chunk->mdlength = be_int4(bei4);
200
201 /* metadata */
202 chunk->ztr_owns = 1;
203 if (chunk->mdlength) {
204 if (NULL == (chunk->mdata = (char *)xmalloc(chunk->mdlength))) {
205 xfree(chunk);
206 return NULL;
207 }
208 if (chunk->mdlength != fread(chunk->mdata, 1, chunk->mdlength, fp)) {
209 xfree(chunk->mdata);
210 xfree(chunk);
211 return NULL;
212 }
213 } else {
214 chunk->mdata = NULL;
215 }
216
217 /* data length */
218 if (1 != fread(&bei4, 4, 1, fp)) {
219 if (chunk->mdata)
220 xfree(chunk->mdata);
221 xfree(chunk);
222 return NULL;
223 }
224 chunk->dlength = be_int4(bei4);
225
226 return chunk;
227 }
228
229 void ztr_process_text(ztr_t *ztr) {
230 int i;
231 ztr_chunk_t **text_chunks = NULL;
232 int ntext_chunks = 0;
233 ztr_text_t *zt = NULL;
234 int nzt = 0;
235 int nalloc = 0;
236
237 if (ztr->text_segments)
238 /* Already done */
239 return;
240
241 text_chunks = ztr_find_chunks(ztr, ZTR_TYPE_TEXT, &ntext_chunks);
242 if (!text_chunks)
243 return;
244
245 for (i = 0; i < ntext_chunks; i++) {
246 char *data;
247 uint4 length;
248 char *ident, *value;
249
250 /* Make sure it's not compressed */
251 uncompress_chunk(ztr, text_chunks[i]);
252
253 data = text_chunks[i]->data;
254 length = text_chunks[i]->dlength;
255
256 if (!length)
257 continue;
258
259 /* Skip RAW header byte */
260 data++;
261 length--;
262
263 while (data - text_chunks[i]->data <= (ptrdiff_t)length &&
264 *(ident = data)) {
265 data += strlen(ident)+1;
266 value = data;
267 if (value)
268 data += strlen(value)+1;
269
270 if (nzt + 1 > nalloc) {
271 nalloc += 10;
272 zt = (ztr_text_t *)xrealloc(zt, nalloc * sizeof(*zt));
273 }
274 zt[nzt].ident = ident;
275 zt[nzt].value = value;
276 nzt++;
277 }
278 }
279
280 ztr->text_segments = zt;
281 ztr->ntext_segments = nzt;
282
283 /*
284 for (i = 0; i < ztr->ntext_segments; i++) {
285 fprintf(stderr, "'%s' = '%s'\n",
286 ztr->text_segments[i].ident,
287 ztr->text_segments[i].value);
288 }
289 */
290
291 xfree(text_chunks);
292 }
293
294
295 /*
296 * fread_ztr
297 *
298 * Reads a ZTR file from 'fp'. This checks for the correct magic number and
299 * major version number, but not minor version number.
300 *
301 * FIXME: Add automatic uncompression?
302 *
303 * Arguments:
304 * fp A readable FILE pointer
305 *
306 * Returns:
307 * Success: Pointer to a ztr_t structure (malloced)
308 * Failure: NULL
309 */
310 ztr_t *fread_ztr(FILE *fp) {
311 ztr_t *ztr;
312 ztr_chunk_t *chunk;
313 int sections = read_sections(0);
314
315 /* Allocate */
316 if (NULL == (ztr = new_ztr()))
317 return NULL;
318
319 /* Read the header */
320 if (-1 == ztr_read_header(fp, &ztr->header))
321 return NULL;
322
323 /* Check magic number and version */
324 if (memcmp(ztr->header.magic, ZTR_MAGIC, 8) != 0)
325 return NULL;
326
327 if (ztr->header.version_major != ZTR_VERSION_MAJOR)
328 return NULL;
329
330 /* Load chunks */
331 while (chunk = ztr_read_chunk_hdr(fp)) {
332 /*
333 char str[5];
334
335 fprintf(stderr, "Read chunk %.4s %08x length %d\n",
336 ZTR_BE2STR(chunk->type, str), chunk->type, chunk->dlength);
337 */
338 switch(chunk->type) {
339 case ZTR_TYPE_HEADER:
340 /* End of file */
341 return ztr;
342
343 case ZTR_TYPE_SAMP:
344 case ZTR_TYPE_SMP4:
345 if (! (sections & READ_SAMPLES)) {
346 fseek(fp, chunk->dlength, SEEK_CUR);
347 xfree(chunk);
348 continue;
349 }
350 break;
351
352
353 case ZTR_TYPE_BASE:
354 case ZTR_TYPE_BPOS:
355 case ZTR_TYPE_CNF4:
356 case ZTR_TYPE_CNF1:
357 case ZTR_TYPE_CSID:
358 if (! (sections & READ_BASES)) {
359 fseek(fp, chunk->dlength, SEEK_CUR);
360 xfree(chunk);
361 continue;
362 }
363 break;
364
365 case ZTR_TYPE_TEXT:
366 if (! (sections & READ_COMMENTS)) {
367 fseek(fp, chunk->dlength, SEEK_CUR);
368 xfree(chunk);
369 continue;
370 }
371 break;
372
373 case ZTR_TYPE_CLIP:
374 case ZTR_TYPE_FLWO:
375 case ZTR_TYPE_FLWC:
376 break;
377
378 /*
379 default:
380 fprintf(stderr, "Unknown chunk type '%s': skipping\n",
381 ZTR_BE2STR(chunk->type,str));
382 fseek(fp, chunk->dlength, SEEK_CUR);
383 xfree(chunk);
384 continue;
385 */
386 }
387
388 chunk->ztr_owns = 1;
389 chunk->data = (char *)xmalloc(chunk->dlength);
390 if (chunk->dlength != fread(chunk->data, 1, chunk->dlength, fp)) {
391 delete_ztr(ztr);
392 return NULL;
393 }
394
395 ztr->nchunks++;
396 ztr->chunk = (ztr_chunk_t *)xrealloc(ztr->chunk, ztr->nchunks *
397 sizeof(ztr_chunk_t));
398 memcpy(&ztr->chunk[ztr->nchunks-1], chunk, sizeof(*chunk));
399 xfree(chunk);
400 }
401
402 return ztr;
403 }
404
405 /*
406 * ---------------------------------------------------------------------------
407 * Other utility functions
408 * ---------------------------------------------------------------------------
409 */
410 /*
411 * new_ztr
412 *
413 * Allocates and initialises a ztr_t structure
414 *
415 * Returns:
416 * ztr_t pointer on success
417 * NULL on failure
418 */
419 ztr_t *new_ztr(void) {
420 ztr_t *ztr;
421
422 /* Allocate */
423 if (NULL == (ztr = (ztr_t *)xmalloc(sizeof(*ztr))))
424 return NULL;
425
426 ztr->chunk = NULL;
427 ztr->nchunks = 0;
428 ztr->text_segments = NULL;
429 ztr->ntext_segments = 0;
430 ztr->delta_level = 3;
431
432 ztr->nhcodes = 0;
433 ztr->hcodes = NULL;
434 ztr->hcodes_checked = 0;
435
436 return ztr;
437 }
438
439 void delete_ztr(ztr_t *ztr) {
440 int i;
441
442 if (!ztr)
443 return;
444
445 if (ztr->chunk) {
446 for (i = 0; i < ztr->nchunks; i++) {
447 if (ztr->chunk[i].data && ztr->chunk[i].ztr_owns)
448 xfree(ztr->chunk[i].data);
449 if (ztr->chunk[i].mdata && ztr->chunk[i].ztr_owns)
450 xfree(ztr->chunk[i].mdata);
451 }
452 xfree(ztr->chunk);
453 }
454
455 if (ztr->hcodes) {
456 for (i = 0; i < ztr->nhcodes; i++) {
457 if (ztr->hcodes[i].codes && ztr->hcodes[i].ztr_owns)
458 huffman_codeset_destroy(ztr->hcodes[i].codes);
459 }
460 free(ztr->hcodes);
461 }
462
463 if (ztr->text_segments)
464 xfree(ztr->text_segments);
465
466 xfree(ztr);
467 }
468
469 /*
470 * ztr_find_chunks
471 *
472 * Searches for chunks of a specific type.
473 *
474 * Returns:
475 * Array of ztr_chunk_t pointers (into the ztr struct). This is
476 * allocated by malloc and it is the callers duty to free this.
477 * NULL if none found.
478 */
479 ztr_chunk_t **ztr_find_chunks(ztr_t *ztr, uint4 type, int *nchunks_p) {
480 ztr_chunk_t **chunks = NULL;
481 int nchunks = 0;
482 int i;
483
484 for (i = 0; i < ztr->nchunks; i++) {
485 if (ztr->chunk[i].type == type) {
486 chunks = (ztr_chunk_t **)xrealloc(chunks, (nchunks + 1) *
487 sizeof(*chunks));
488 chunks[nchunks++] = &ztr->chunk[i];
489 }
490 }
491 *nchunks_p = nchunks;
492 return chunks;
493 }
494
495 /*
496 * Shannon showed that for storage in base 'b' with alphabet symbols 'a' having
497 * a probability of ocurring in any context of 'Pa' we should encode
498 * symbol 'a' to have a storage width of -logb(Pa).
499 *
500 * Eg. b = 26, P(e) = .22. => width .4647277.
501 *
502 * We use this to calculate the entropy of a signal by summing over all letters
503 * in the signal. In this case, our storage has base 256.
504 */
505 #define EBASE 256
506 static double entropy(unsigned char *data, int len) {
507 double E[EBASE];
508 double P[EBASE];
509 double e;
510 int i;
511
512 for (i = 0; i < EBASE; i++)
513 P[i] = 0;
514
515 for (i = 0; i < len; i++)
516 P[data[i]]++;
517
518 for (i = 0; i < EBASE; i++) {
519 if (P[i]) {
520 P[i] /= len;
521 E[i] = -(log(P[i])/log(EBASE));
522 } else {
523 E[i] = 0;
524 }
525 }
526
527 for (e = i = 0; i < len; i++)
528 e += E[data[i]];
529
530 return e;
531 }
532
533 /*
534 * Adds a user-defined huffman_codeset_t code-set to the available code sets
535 * used by huffman_encode or huffman_decode.
536 *
537 * Note that the 'codes' memory is then "owned" by the ztr object if "ztr_owns"
538 * is true and will be deallocated when the ztr object is destroyed. Otherwise
539 * freeing the ztr object will not touch the passed in codes.
540 */
541 ztr_hcode_t *ztr_add_hcode(ztr_t *ztr, huffman_codeset_t *codes,
542 int ztr_owns) {
543 if (!codes)
544 return NULL;
545
546 ztr->hcodes = realloc(ztr->hcodes, (ztr->nhcodes+1)*sizeof(*ztr->hcodes));
547 ztr->hcodes[ztr->nhcodes].codes = codes;
548 ztr->hcodes[ztr->nhcodes].ztr_owns = ztr_owns;
549
550 return &ztr->hcodes[ztr->nhcodes++];
551 }
552
553 /*
554 * Searches through the cached huffman_codeset_t tables looking for a stored
555 * huffman code of type 'code_set'.
556 * NB: only code_sets >= CODE_USER will be stored here.
557 *
558 * Returns codes on success,
559 * NULL on failure
560 */
561 ztr_hcode_t *ztr_find_hcode(ztr_t *ztr, int code_set) {
562 int i;
563
564 if (code_set < CODE_USER)
565 return NULL; /* computed on-the-fly or use a hard-coded set */
566
567 /* Check through chunks for undecoded HUFF chunks */
568 if (!ztr->hcodes_checked) {
569 for (i = 0; i < ztr->nchunks; i++) {
570 if (ztr->chunk[i].type == ZTR_TYPE_HUFF) {
571 block_t *blk;
572 huffman_codeset_t *cs;
573 uncompress_chunk(ztr, &ztr->chunk[i]);
574 blk = block_create((unsigned char *)(ztr->chunk[i].data+2),
575 ztr->chunk[i].dlength-2);
576 cs = restore_codes(blk, NULL);
577 if (!cs) {
578 block_destroy(blk, 1);
579 return NULL;
580 }
581 cs->code_set = (unsigned char)(ztr->chunk[i].data[1]);
582 ztr_add_hcode(ztr, cs, 1);
583 block_destroy(blk, 1);
584 }
585 }
586 ztr->hcodes_checked = 1;
587 }
588
589 /* Check cached copies */
590 for (i = 0; i < ztr->nhcodes; i++) {
591 if (ztr->hcodes[i].codes->code_set == code_set)
592 return &ztr->hcodes[i];
593 }
594
595 return NULL;
596 }
597
598 ztr_chunk_t *ztr_find_hcode_chunk(ztr_t *ztr, int code_set) {
599 int i;
600
601 if (code_set < CODE_USER)
602 return NULL; /* computed on-the-fly or use a hard-coded set */
603
604 /* Check through chunks for undecoded HUFF chunks */
605 for (i = 0; i < ztr->nchunks; i++) {
606 if (ztr->chunk[i].type == ZTR_TYPE_HUFF) {
607 uncompress_chunk(ztr, &ztr->chunk[i]);
608 if (ztr->chunk[i].dlength >= 2 &&
609 (unsigned char)ztr->chunk[i].data[1] == code_set)
610 return &ztr->chunk[i];
611 }
612 }
613
614 return NULL;
615 }
616
617 /*
618 * Adds a new chunk to a ztr file and returns the chunk pointer.
619 * The data and mdata fields can be NULL and the chunk will not be
620 * initialised.
621 *
622 * Returns new chunk ptr on success.
623 * NULL on failure.
624 */
625 ztr_chunk_t *ztr_new_chunk(ztr_t *ztr, uint4 type,
626 char *data, uint4 dlength,
627 char *mdata, uint4 mdlength) {
628 ztr_chunk_t *chunks, *c;
629
630 /* Grow the chunk array */
631 chunks = (ztr_chunk_t *)realloc(ztr->chunk,
632 (ztr->nchunks+1) * sizeof(*chunks));
633 if (!chunks)
634 return NULL;
635 ztr->chunk = chunks;
636
637 /* Initialise */
638 c = &ztr->chunk[ztr->nchunks++];
639 c->type = type;
640 c->data = data;
641 c->dlength = dlength;
642 c->mdata = mdata;
643 c->mdlength = mdlength;
644 c->ztr_owns = 1;
645
646 return c;
647 }
648
649 /*
650 * Adds a key/value pair to a ztr TEXT chunk.
651 * The 'ch' chunk may be explicitly specified in which case the text
652 * is added to that chunk or it may be specified as NULL in which case
653 * the key/value pair are added to the first available TEXT chunk,
654 * possibly creating a new one if required.
655 *
656 * NOTE: If the key already exists in the text chunk this appends a new
657 * copy; it does not overwrite the old one.
658 *
659 * Returns ztr text chunk ptr for success
660 * NULL for failure
661 */
662 ztr_chunk_t *ztr_add_text(ztr_t *z, ztr_chunk_t *ch,
663 const char *key, const char *value) {
664 ztr_chunk_t **text_chunks = NULL;
665 int ntext_chunks;
666 size_t key_len, value_len;
667 char *cp;
668
669 /* Find/create the appropriate chunk */
670 if (!ch) {
671 text_chunks = ztr_find_chunks(z, ZTR_TYPE_TEXT, &ntext_chunks);
672 if (!text_chunks) {
673 ch = ztr_new_chunk(z, ZTR_TYPE_TEXT, NULL, 0, NULL, 0);
674 } else {
675 ch = text_chunks[0];
676 xfree(text_chunks);
677 }
678 }
679
680 if (ch->type != ZTR_TYPE_TEXT)
681 return NULL;
682
683 /* Make sure it's not compressed */
684 uncompress_chunk(z, ch);
685
686 /* Append key\0value\0 */
687 key_len = strlen(key);
688 value_len = strlen(value);
689 cp = ch->data;
690 if (cp) {
691 /* Set ch->dlength to the last non-nul byte of the previous value */
692 while (ch->dlength && ch->data[ch->dlength-1] == 0)
693 ch->dlength--;
694 }
695
696 cp = realloc(ch->data, 1 + ch->dlength + key_len + value_len + 3);
697 if (NULL == cp)
698 return NULL;
699 else
700 ch->data = cp;
701
702 cp = &ch->data[ch->dlength];
703
704 /*
705 * Note this is a bit cryptic, but it works.
706 * When appending to an existing text chunk we write a preceeding nul
707 * to mark the end of the previous value (we rewound above specifically
708 * for this case).
709 * When creating a new chunk we still write a nul, but in this case it's
710 * the RAW format byte. After the value we add an extra nul to
711 * indicate the last entry.
712 */
713 ch->dlength += 1+sprintf(cp, "%c%s%c%s%c", 0, key, 0, value, 0);
714
715 return ch;
716 }
717
718
719
720 /*
721 * Stores held ztr huffman_codes as ZTR chunks.
722 * Returns 0 for success
723 * -1 for failure
724 */
725 int ztr_store_hcodes(ztr_t *ztr) {
726 int i;
727 ztr_chunk_t *chunks;
728 int nchunks;
729
730 if (ztr->nhcodes == 0)
731 return 0;
732
733 /* Extend chunks array */
734 nchunks = ztr->nchunks + ztr->nhcodes;
735 chunks = (ztr_chunk_t *)realloc(ztr->chunk, nchunks * sizeof(*chunks));
736 if (!chunks)
737 return -1;
738 ztr->chunk = chunks;
739
740 /* Encode */
741 for (i = 0; i < ztr->nhcodes; i++) {
742 block_t *blk = block_create(NULL, 2);
743 int j = ztr->nchunks;
744 unsigned char bytes[2];
745
746 ztr->chunk[j].type = ZTR_TYPE_HUFF;
747 ztr->chunk[j].mdata = 0;
748 ztr->chunk[j].mdlength = 0;
749 ztr->chunk[j].ztr_owns = 1;
750 bytes[0] = 0;
751 bytes[1] = ztr->hcodes[i].codes->code_set;
752 store_bytes(blk, bytes, 2);
753 /* FIXME: Now already cached in ztr_hcode_t */
754 if (0 == store_codes(blk, ztr->hcodes[i].codes, 1)) {
755 /* Last byte is always merged with first of stream */
756 if (blk->bit == 0) {
757 unsigned char zero = 0;
758 store_bytes(blk, &zero, 1);
759 }
760
761 ztr->chunk[j].data = (char *)blk->data;
762 ztr->chunk[j].dlength = blk->byte + (blk->bit != 0);
763 block_destroy(blk, 1);
764 ztr->nchunks++;
765 }
766 }
767
768 return ztr->nchunks == nchunks ? 0 : -1;
769 }
770
771 /*
772 * Given a ZTR chunk this searches through the meta-data key/value pairings
773 * to return the corresponding value.
774 *
775 * Returns a pointer into the mdata on success (nul-terminated)
776 * NULL on failure.
777 */
778 char *ztr_lookup_mdata_value(ztr_t *z, ztr_chunk_t *chunk, char *key) {
779 if (z->header.version_major > 1 ||
780 z->header.version_minor >= 2) {
781 /* ZTR format 1.2 onwards */
782 char *cp = chunk->mdata;
783 int32_t dlen = chunk->mdlength;
784
785 /*
786 * NB: we may wish to rewrite this using a dedicated state machine
787 * instead of strlen/strcmp as this currently assumes the meta-
788 * data is correctly formatted, which we cannot assume as the
789 * metadata is external and outside of our control.
790 * Passing in non-nul terminated strings could crash this code.
791 */
792 while (dlen > 0) {
793 size_t l;
794 int found;
795
796 /* key */
797 l = strlen(cp);
798 found = strcmp(cp, key) == 0;
799 cp += l+1;
800 dlen -= l+1;
801
802 /* value */
803 if (found)
804 return cp;
805 l = strlen(cp);
806 cp += l+1;
807 dlen -= l+1;
808 }
809 return NULL;
810
811 } else {
812 /* v1.1 and before only supported a few types, specifically coded
813 * per chunk type.
814 */
815 switch (chunk->type) {
816 case ZTR_TYPE_SAMP:
817 case ZTR_TYPE_SMP4:
818 if (strcmp(key, "TYPE"))
819 return chunk->mdata;
820 break;
821
822 default:
823 break;
824 }
825 }
826
827 return NULL;
828 }
829
830 /*
831 * Compresses an individual chunk using a specific format. The format is one
832 * of the 'format' fields listed in the spec; one of the ZTR_FORM_ macros.
833 */
834 int compress_chunk(ztr_t *ztr, ztr_chunk_t *chunk, int format,
835 int option, int option2) {
836 char *new_data = NULL;
837 int new_len;
838
839 switch (format) {
840 case ZTR_FORM_RAW:
841 return 0;
842
843 case ZTR_FORM_RLE:
844 new_data = rle(chunk->data, chunk->dlength, option, &new_len);
845 if (entropy((unsigned char *)new_data, new_len) >=
846 entropy((unsigned char *)chunk->data, chunk->dlength)) {
847 xfree(new_data);
848 return 0;
849 }
850 break;
851
852 case ZTR_FORM_XRLE:
853 new_data = xrle(chunk->data, chunk->dlength, option,option2, &new_len);
854 break;
855
856 case ZTR_FORM_XRLE2:
857 new_data = xrle2(chunk->data, chunk->dlength, option, &new_len);
858 break;
859
860 case ZTR_FORM_ZLIB:
861 new_data = zlib_huff(chunk->data, chunk->dlength, option, &new_len);
862 break;
863
864 case ZTR_FORM_DELTA1:
865 new_data = decorrelate1(chunk->data, chunk->dlength, option, &new_len);
866 break;
867
868 case ZTR_FORM_DDELTA1:
869 new_data = decorrelate1dyn(chunk->data, chunk->dlength, &new_len);
870 break;
871
872 case ZTR_FORM_DELTA2:
873 new_data = decorrelate2(chunk->data, chunk->dlength, option, &new_len);
874 break;
875
876 case ZTR_FORM_DDELTA2:
877 new_data = decorrelate2dyn(chunk->data, chunk->dlength, &new_len);
878 break;
879
880 case ZTR_FORM_DELTA4:
881 new_data = decorrelate4(chunk->data, chunk->dlength, option, &new_len);
882 break;
883
884 case ZTR_FORM_16TO8:
885 new_data = shrink_16to8(chunk->data, chunk->dlength, &new_len);
886 break;
887
888 case ZTR_FORM_32TO8:
889 new_data = shrink_32to8(chunk->data, chunk->dlength, &new_len);
890 break;
891
892 case ZTR_FORM_FOLLOW1:
893 new_data = follow1(chunk->data, chunk->dlength, &new_len);
894 break;
895
896 case ZTR_FORM_ICHEB:
897 new_data = ichebcomp(chunk->data, chunk->dlength, &new_len);
898 break;
899
900 case ZTR_FORM_LOG2:
901 new_data = log2_data(chunk->data, chunk->dlength, &new_len);
902 break;
903
904 case ZTR_FORM_STHUFF:
905 new_data = sthuff(ztr, chunk->data, chunk->dlength,
906 option, option2, &new_len);
907 break;
908
909 case ZTR_FORM_QSHIFT:
910 new_data = qshift(chunk->data, chunk->dlength, &new_len);
911 break;
912
913 case ZTR_FORM_TSHIFT:
914 new_data = tshift(ztr, chunk->data, chunk->dlength, &new_len);
915 break;
916 }
917
918 if (!new_data) {
919 fprintf(stderr, "!!ERROR!!\n");
920 return -1;
921 }
922
923 /*
924 fprintf(stderr, "Format %d => %d to %d\n", format, chunk->dlength, new_len);
925 */
926
927 chunk->dlength = new_len;
928 xfree(chunk->data);
929 chunk->data = new_data;
930
931 return 0;
932 }
933
934 /*
935 * Uncompresses an individual chunk from all levels of compression.
936 */
937 int uncompress_chunk(ztr_t *ztr, ztr_chunk_t *chunk) {
938 char *new_data = NULL;
939 int new_len;
940
941 while (chunk->dlength > 0 && chunk->data[0] != ZTR_FORM_RAW) {
942 switch (chunk->data[0]) {
943 case ZTR_FORM_RLE:
944 new_data = unrle(chunk->data, chunk->dlength, &new_len);
945 break;
946
947 case ZTR_FORM_XRLE:
948 new_data = unxrle(chunk->data, chunk->dlength, &new_len);
949 break;
950
951 case ZTR_FORM_XRLE2:
952 new_data = unxrle2(chunk->data, chunk->dlength, &new_len);
953 break;
954
955 case ZTR_FORM_ZLIB:
956 new_data = zlib_dehuff(chunk->data, chunk->dlength, &new_len);
957 break;
958
959 case ZTR_FORM_DELTA1:
960 new_data = recorrelate1(chunk->data, chunk->dlength, &new_len);
961 break;
962
963 case ZTR_FORM_DELTA2:
964 new_data = recorrelate2(chunk->data, chunk->dlength, &new_len);
965 break;
966
967 case ZTR_FORM_DELTA4:
968 new_data = recorrelate4(chunk->data, chunk->dlength, &new_len);
969 break;
970
971 case ZTR_FORM_16TO8:
972 new_data = expand_8to16(chunk->data, chunk->dlength, &new_len);
973 break;
974
975 case ZTR_FORM_32TO8:
976 new_data = expand_8to32(chunk->data, chunk->dlength, &new_len);
977 break;
978
979 case ZTR_FORM_FOLLOW1:
980 new_data = unfollow1(chunk->data, chunk->dlength, &new_len);
981 break;
982
983 case ZTR_FORM_ICHEB:
984 new_data = ichebuncomp(chunk->data, chunk->dlength, &new_len);
985 break;
986
987 case ZTR_FORM_LOG2:
988 new_data = unlog2_data(chunk->data, chunk->dlength, &new_len);
989 break;
990
991 case ZTR_FORM_STHUFF:
992 new_data = unsthuff(ztr, chunk->data, chunk->dlength, &new_len);
993 break;
994
995 case ZTR_FORM_QSHIFT:
996 new_data = unqshift(chunk->data, chunk->dlength, &new_len);
997 break;
998
999 case ZTR_FORM_TSHIFT:
1000 new_data = untshift(ztr, chunk->data, chunk->dlength, &new_len);
1001 break;
1002
1003 default:
1004 fprintf(stderr, "Unknown encoding format %d\n", chunk->data[0]);
1005 return -1;
1006 }
1007
1008 if (!new_data)
1009 return -1;
1010
1011 /*
1012 fprintf(stderr, "format %d => %d to %d\n",
1013 chunk->data[0], chunk->dlength, new_len);
1014 */
1015
1016 chunk->dlength = new_len;
1017 xfree(chunk->data);
1018 chunk->data = new_data;
1019 }
1020
1021 return 0;
1022 }
1023
1024 /*
1025 * Compresses a ztr (in memory).
1026 * Level is 0, 1, 2 or 3 (no compression, delta, delta + zlib,
1027 * chebyshev + zlib).
1028 */
1029 int compress_ztr(ztr_t *ztr, int level) {
1030 int i;
1031
1032 if (0 == level)
1033 return 0;
1034
1035 for (i = 0; i < ztr->nchunks; i++) {
1036 /*
1037 {
1038 char str[5];
1039 fprintf(stderr, "---- %.4s ----\n",
1040 ZTR_BE2STR(ztr->chunk[i].type,str));
1041 }
1042 fprintf(stderr, "Uncomp length=%d\n", ztr->chunk[i].dlength);
1043 */
1044
1045 switch(ztr->chunk[i].type) {
1046 char *type;
1047 case ZTR_TYPE_SAMP:
1048 case ZTR_TYPE_SMP4:
1049 #ifdef ILLUMINA_GA
1050 compress_chunk(ztr, &ztr->chunk[i],
1051 ZTR_FORM_STHUFF, CODE_TRACES, 0);
1052 #else
1053 type = ztr_lookup_mdata_value(ztr, &ztr->chunk[i], "TYPE");
1054 if (type && 0 == strcmp(type, "PYRW")) {
1055 /* Raw data is not really compressable */
1056 } else if (type && 0 == strcmp(type, "PYNO")) {
1057 if (level > 1) {
1058 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_16TO8, 0, 0);
1059 compress_chunk(ztr, &ztr->chunk[i],
1060 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1061 }
1062 } else {
1063 if (level <= 2) {
1064 /*
1065 * Experiments show that typically a double delta does
1066 * better than a single delta for 8-bit data, and the other
1067 * way around for 16-bit data
1068 */
1069 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA2,
1070 ztr->delta_level, 0);
1071 } else {
1072 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_ICHEB, 0, 0);
1073 }
1074
1075 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_16TO8, 0, 0);
1076 if (level > 1) {
1077 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_FOLLOW1,0, 0);
1078 /*
1079 compress_chunk(ztr, &ztr->chunk[i],
1080 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY);
1081 */
1082 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE, 150, 0);
1083 compress_chunk(ztr, &ztr->chunk[i],
1084 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1085 }
1086 }
1087 #endif
1088 break;
1089
1090 case ZTR_TYPE_BASE:
1091 #ifdef ILLUMINA_GA
1092 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_STHUFF, CODE_DNA, 0);
1093 #else
1094 if (level > 1) {
1095 compress_chunk(ztr, &ztr->chunk[i],
1096 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1097 }
1098 #endif
1099 break;
1100
1101 case ZTR_TYPE_CNF1:
1102 case ZTR_TYPE_CNF4:
1103 case ZTR_TYPE_CSID:
1104 #ifdef ILLUMINA_GA
1105 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE, 77, 0);
1106 compress_chunk(ztr, &ztr->chunk[i],
1107 ZTR_FORM_STHUFF, CODE_CONF_RLE, 0);
1108 #else
1109 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA1, 1, 0);
1110 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE, 77, 0);
1111 if (level > 1) {
1112 compress_chunk(ztr, &ztr->chunk[i],
1113 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1114 }
1115 #endif
1116 break;
1117
1118 case ZTR_TYPE_BPOS:
1119 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA4, 1, 0);
1120 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_32TO8, 0, 0);
1121 if (level > 1) {
1122 compress_chunk(ztr, &ztr->chunk[i],
1123 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1124 }
1125 break;
1126
1127 case ZTR_TYPE_TEXT:
1128 #ifdef ILLUMINA_GA
1129 #else
1130 if (level > 1) {
1131 compress_chunk(ztr, &ztr->chunk[i],
1132 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0);
1133 }
1134 #endif
1135 break;
1136
1137 case ZTR_TYPE_FLWO:
1138 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_XRLE, 0, 4);
1139 break;
1140
1141 }
1142
1143 /*
1144 fprintf(stderr, "Comp length=%d\n", ztr->chunk[i].dlength);
1145 */
1146 }
1147
1148 return 0;
1149 }
1150
1151 /*
1152 * Uncompresses a ztr (in memory).
1153 */
1154 int uncompress_ztr(ztr_t *ztr) {
1155 int i;
1156
1157 for (i = 0; i < ztr->nchunks; i++) {
1158 /*
1159 {
1160 char str[5];
1161 fprintf(stderr, "---- %.4s ----\n",
1162 ZTR_BE2STR(ztr->chunk[i].type,str));
1163 }
1164 */
1165 uncompress_chunk(ztr, &ztr->chunk[i]);
1166 }
1167
1168 return 0;
1169 }
1170