Mercurial > repos > dawe > srf2fastq
comparison srf2fastq/io_lib-1.12.2/io_lib/ztr.c @ 0:d901c9f41a6a default tip
Migrated tool version 1.0.1 from old tool shed archive to new tool shed repository
author | dawe |
---|---|
date | Tue, 07 Jun 2011 17:48:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d901c9f41a6a |
---|---|
1 #include <stdio.h> | |
2 #include <string.h> | |
3 #include <math.h> | |
4 | |
5 /* #include <fcntl.h> */ | |
6 | |
7 #include "io_lib/ztr.h" | |
8 #include "io_lib/xalloc.h" | |
9 #include "io_lib/Read.h" | |
10 #include "io_lib/compression.h" | |
11 #include "io_lib/stdio_hack.h" | |
12 #include "io_lib/deflate_interlaced.h" | |
13 | |
14 /* Deprecated #define - see solexa2srf for the more up to date code */ | |
15 /* #define ILLUMINA_GA */ | |
16 | |
17 /* | |
18 * --------------------------------------------------------------------------- | |
19 * Trace writing functions. | |
20 * These consist of several encoding functions, all with the same prototype, | |
21 * and a single fwrite_ztr function to wrap it all up. | |
22 * --------------------------------------------------------------------------- | |
23 */ | |
24 | |
25 /* | |
26 * ztr_write_header | |
27 * | |
28 * Writes a ZTR file header. | |
29 * | |
30 * Arguments: | |
31 * fp A FILE pointer | |
32 * h A pointer to the header to write | |
33 * | |
34 * Returns: | |
35 * Success: 0 | |
36 * Failure: -1 | |
37 */ | |
38 static int ztr_write_header(FILE *fp, ztr_header_t *h) { | |
39 if (1 != fwrite(h, sizeof(*h), 1, fp)) | |
40 return -1; | |
41 | |
42 return 0; | |
43 } | |
44 | |
45 /* | |
46 * ztr_write_chunk | |
47 * | |
48 * Writes a ZTR chunk including chunk header and data | |
49 * | |
50 * Arguments: | |
51 * fp A FILE pointer | |
52 * chunk A pointer to the chunk to write | |
53 * | |
54 * Returns: | |
55 * Success: 0 | |
56 * Failure: -1 | |
57 */ | |
58 static int ztr_write_chunk(FILE *fp, ztr_chunk_t *chunk) { | |
59 int4 bei4; | |
60 | |
61 /* | |
62 { | |
63 char str[5]; | |
64 fprintf(stderr, "Write chunk %.4s %08x length %d\n", | |
65 ZTR_BE2STR(chunk->type, str), chunk->type, chunk->dlength); | |
66 } | |
67 */ | |
68 | |
69 /* type */ | |
70 bei4 = be_int4(chunk->type); | |
71 if (1 != fwrite(&bei4, 4, 1, fp)) | |
72 return -1; | |
73 | |
74 /* metadata length */ | |
75 bei4 = be_int4(chunk->mdlength); | |
76 if (1 != fwrite(&bei4, 4, 1, fp)) | |
77 return -1; | |
78 | |
79 /* metadata */ | |
80 if (chunk->mdlength) | |
81 if (chunk->mdlength != fwrite(chunk->mdata, 1, chunk->mdlength, fp)) | |
82 return -1; | |
83 | |
84 /* data length */ | |
85 bei4 = be_int4(chunk->dlength); | |
86 if (1 != fwrite(&bei4, 4, 1, fp)) | |
87 return -1; | |
88 | |
89 /* data */ | |
90 if (chunk->dlength != fwrite(chunk->data, 1, chunk->dlength, fp)) | |
91 return -1; | |
92 | |
93 return 0; | |
94 } | |
95 | |
96 /* | |
97 * fwrite_ztr | |
98 * | |
99 * Writes a ZTR file held in the ztr_t structure. | |
100 * It is assumed that all the correct lengths, magic numbers, etc in the | |
101 * ztr_t struct have already been initialised correctly. | |
102 * | |
103 * FIXME: Add a 'method' argument which encodes formats? Perhaps store this | |
104 * in the ztr struct? | |
105 * | |
106 * Arguments: | |
107 * fp A writable FILE pointer | |
108 * ztr A pointer to the ztr_t struct to write. | |
109 * | |
110 * Returns: | |
111 * Success: 0 | |
112 * Failure: -1 | |
113 */ | |
114 int fwrite_ztr(FILE *fp, ztr_t *ztr) { | |
115 int i; | |
116 | |
117 /* Write the header record */ | |
118 if (-1 == ztr_write_header(fp, &ztr->header)) | |
119 return -1; | |
120 | |
121 /* Write the chunks */ | |
122 for (i = 0; i < ztr->nchunks; i++) { | |
123 if (-1 == ztr_write_chunk(fp, &ztr->chunk[i])) | |
124 return -1; | |
125 #if 0 | |
126 { | |
127 int fd; | |
128 char fname[1024]; | |
129 sprintf(fname, "chunk.%d", i); | |
130 fd = open(fname, O_RDWR|O_CREAT|O_TRUNC, 0666); | |
131 write(fd, ztr->chunk[i].data, ztr->chunk[i].dlength); | |
132 close(fd); | |
133 } | |
134 #endif | |
135 } | |
136 | |
137 return 0; | |
138 } | |
139 | |
140 /* | |
141 * --------------------------------------------------------------------------- | |
142 * Trace reading functions. | |
143 * These consist of several decoding functions, all with the same prototype, | |
144 * and a single fread_ztr function to wrap it all up. | |
145 * --------------------------------------------------------------------------- | |
146 */ | |
147 | |
148 /* | |
149 * ztr_read_header | |
150 * | |
151 * Reads a ZTR file header. | |
152 * | |
153 * Arguments: | |
154 * fp A FILE pointer | |
155 * h Where to write the header to | |
156 * | |
157 * Returns: | |
158 * Success: 0 | |
159 * Failure: -1 | |
160 */ | |
161 int ztr_read_header(FILE *fp, ztr_header_t *h) { | |
162 if (1 != fread(h, sizeof(*h), 1, fp)) | |
163 return -1; | |
164 | |
165 return 0; | |
166 } | |
167 | |
168 /* | |
169 * ztr_read_chunk_hdr | |
170 * | |
171 * Reads a ZTR chunk header and metadata, but not the main data segment. | |
172 * | |
173 * Arguments: | |
174 * fp A FILE pointer | |
175 * | |
176 * Returns: | |
177 * Success: a chunk pointer (malloced) | |
178 * Failure: NULL | |
179 */ | |
180 ztr_chunk_t *ztr_read_chunk_hdr(FILE *fp) { | |
181 int4 bei4; | |
182 ztr_chunk_t *chunk; | |
183 | |
184 if (NULL == (chunk = (ztr_chunk_t *)xmalloc(sizeof(*chunk)))) | |
185 return NULL; | |
186 | |
187 /* type */ | |
188 if (1 != fread(&bei4, 4, 1, fp)) { | |
189 xfree(chunk); | |
190 return NULL; | |
191 } | |
192 chunk->type = be_int4(bei4); | |
193 | |
194 /* metadata length */ | |
195 if (1 != fread(&bei4, 4, 1, fp)) { | |
196 xfree(chunk); | |
197 return NULL; | |
198 } | |
199 chunk->mdlength = be_int4(bei4); | |
200 | |
201 /* metadata */ | |
202 chunk->ztr_owns = 1; | |
203 if (chunk->mdlength) { | |
204 if (NULL == (chunk->mdata = (char *)xmalloc(chunk->mdlength))) { | |
205 xfree(chunk); | |
206 return NULL; | |
207 } | |
208 if (chunk->mdlength != fread(chunk->mdata, 1, chunk->mdlength, fp)) { | |
209 xfree(chunk->mdata); | |
210 xfree(chunk); | |
211 return NULL; | |
212 } | |
213 } else { | |
214 chunk->mdata = NULL; | |
215 } | |
216 | |
217 /* data length */ | |
218 if (1 != fread(&bei4, 4, 1, fp)) { | |
219 if (chunk->mdata) | |
220 xfree(chunk->mdata); | |
221 xfree(chunk); | |
222 return NULL; | |
223 } | |
224 chunk->dlength = be_int4(bei4); | |
225 | |
226 return chunk; | |
227 } | |
228 | |
229 void ztr_process_text(ztr_t *ztr) { | |
230 int i; | |
231 ztr_chunk_t **text_chunks = NULL; | |
232 int ntext_chunks = 0; | |
233 ztr_text_t *zt = NULL; | |
234 int nzt = 0; | |
235 int nalloc = 0; | |
236 | |
237 if (ztr->text_segments) | |
238 /* Already done */ | |
239 return; | |
240 | |
241 text_chunks = ztr_find_chunks(ztr, ZTR_TYPE_TEXT, &ntext_chunks); | |
242 if (!text_chunks) | |
243 return; | |
244 | |
245 for (i = 0; i < ntext_chunks; i++) { | |
246 char *data; | |
247 uint4 length; | |
248 char *ident, *value; | |
249 | |
250 /* Make sure it's not compressed */ | |
251 uncompress_chunk(ztr, text_chunks[i]); | |
252 | |
253 data = text_chunks[i]->data; | |
254 length = text_chunks[i]->dlength; | |
255 | |
256 if (!length) | |
257 continue; | |
258 | |
259 /* Skip RAW header byte */ | |
260 data++; | |
261 length--; | |
262 | |
263 while (data - text_chunks[i]->data <= (ptrdiff_t)length && | |
264 *(ident = data)) { | |
265 data += strlen(ident)+1; | |
266 value = data; | |
267 if (value) | |
268 data += strlen(value)+1; | |
269 | |
270 if (nzt + 1 > nalloc) { | |
271 nalloc += 10; | |
272 zt = (ztr_text_t *)xrealloc(zt, nalloc * sizeof(*zt)); | |
273 } | |
274 zt[nzt].ident = ident; | |
275 zt[nzt].value = value; | |
276 nzt++; | |
277 } | |
278 } | |
279 | |
280 ztr->text_segments = zt; | |
281 ztr->ntext_segments = nzt; | |
282 | |
283 /* | |
284 for (i = 0; i < ztr->ntext_segments; i++) { | |
285 fprintf(stderr, "'%s' = '%s'\n", | |
286 ztr->text_segments[i].ident, | |
287 ztr->text_segments[i].value); | |
288 } | |
289 */ | |
290 | |
291 xfree(text_chunks); | |
292 } | |
293 | |
294 | |
295 /* | |
296 * fread_ztr | |
297 * | |
298 * Reads a ZTR file from 'fp'. This checks for the correct magic number and | |
299 * major version number, but not minor version number. | |
300 * | |
301 * FIXME: Add automatic uncompression? | |
302 * | |
303 * Arguments: | |
304 * fp A readable FILE pointer | |
305 * | |
306 * Returns: | |
307 * Success: Pointer to a ztr_t structure (malloced) | |
308 * Failure: NULL | |
309 */ | |
310 ztr_t *fread_ztr(FILE *fp) { | |
311 ztr_t *ztr; | |
312 ztr_chunk_t *chunk; | |
313 int sections = read_sections(0); | |
314 | |
315 /* Allocate */ | |
316 if (NULL == (ztr = new_ztr())) | |
317 return NULL; | |
318 | |
319 /* Read the header */ | |
320 if (-1 == ztr_read_header(fp, &ztr->header)) | |
321 return NULL; | |
322 | |
323 /* Check magic number and version */ | |
324 if (memcmp(ztr->header.magic, ZTR_MAGIC, 8) != 0) | |
325 return NULL; | |
326 | |
327 if (ztr->header.version_major != ZTR_VERSION_MAJOR) | |
328 return NULL; | |
329 | |
330 /* Load chunks */ | |
331 while (chunk = ztr_read_chunk_hdr(fp)) { | |
332 /* | |
333 char str[5]; | |
334 | |
335 fprintf(stderr, "Read chunk %.4s %08x length %d\n", | |
336 ZTR_BE2STR(chunk->type, str), chunk->type, chunk->dlength); | |
337 */ | |
338 switch(chunk->type) { | |
339 case ZTR_TYPE_HEADER: | |
340 /* End of file */ | |
341 return ztr; | |
342 | |
343 case ZTR_TYPE_SAMP: | |
344 case ZTR_TYPE_SMP4: | |
345 if (! (sections & READ_SAMPLES)) { | |
346 fseek(fp, chunk->dlength, SEEK_CUR); | |
347 xfree(chunk); | |
348 continue; | |
349 } | |
350 break; | |
351 | |
352 | |
353 case ZTR_TYPE_BASE: | |
354 case ZTR_TYPE_BPOS: | |
355 case ZTR_TYPE_CNF4: | |
356 case ZTR_TYPE_CNF1: | |
357 case ZTR_TYPE_CSID: | |
358 if (! (sections & READ_BASES)) { | |
359 fseek(fp, chunk->dlength, SEEK_CUR); | |
360 xfree(chunk); | |
361 continue; | |
362 } | |
363 break; | |
364 | |
365 case ZTR_TYPE_TEXT: | |
366 if (! (sections & READ_COMMENTS)) { | |
367 fseek(fp, chunk->dlength, SEEK_CUR); | |
368 xfree(chunk); | |
369 continue; | |
370 } | |
371 break; | |
372 | |
373 case ZTR_TYPE_CLIP: | |
374 case ZTR_TYPE_FLWO: | |
375 case ZTR_TYPE_FLWC: | |
376 break; | |
377 | |
378 /* | |
379 default: | |
380 fprintf(stderr, "Unknown chunk type '%s': skipping\n", | |
381 ZTR_BE2STR(chunk->type,str)); | |
382 fseek(fp, chunk->dlength, SEEK_CUR); | |
383 xfree(chunk); | |
384 continue; | |
385 */ | |
386 } | |
387 | |
388 chunk->ztr_owns = 1; | |
389 chunk->data = (char *)xmalloc(chunk->dlength); | |
390 if (chunk->dlength != fread(chunk->data, 1, chunk->dlength, fp)) { | |
391 delete_ztr(ztr); | |
392 return NULL; | |
393 } | |
394 | |
395 ztr->nchunks++; | |
396 ztr->chunk = (ztr_chunk_t *)xrealloc(ztr->chunk, ztr->nchunks * | |
397 sizeof(ztr_chunk_t)); | |
398 memcpy(&ztr->chunk[ztr->nchunks-1], chunk, sizeof(*chunk)); | |
399 xfree(chunk); | |
400 } | |
401 | |
402 return ztr; | |
403 } | |
404 | |
405 /* | |
406 * --------------------------------------------------------------------------- | |
407 * Other utility functions | |
408 * --------------------------------------------------------------------------- | |
409 */ | |
410 /* | |
411 * new_ztr | |
412 * | |
413 * Allocates and initialises a ztr_t structure | |
414 * | |
415 * Returns: | |
416 * ztr_t pointer on success | |
417 * NULL on failure | |
418 */ | |
419 ztr_t *new_ztr(void) { | |
420 ztr_t *ztr; | |
421 | |
422 /* Allocate */ | |
423 if (NULL == (ztr = (ztr_t *)xmalloc(sizeof(*ztr)))) | |
424 return NULL; | |
425 | |
426 ztr->chunk = NULL; | |
427 ztr->nchunks = 0; | |
428 ztr->text_segments = NULL; | |
429 ztr->ntext_segments = 0; | |
430 ztr->delta_level = 3; | |
431 | |
432 ztr->nhcodes = 0; | |
433 ztr->hcodes = NULL; | |
434 ztr->hcodes_checked = 0; | |
435 | |
436 return ztr; | |
437 } | |
438 | |
439 void delete_ztr(ztr_t *ztr) { | |
440 int i; | |
441 | |
442 if (!ztr) | |
443 return; | |
444 | |
445 if (ztr->chunk) { | |
446 for (i = 0; i < ztr->nchunks; i++) { | |
447 if (ztr->chunk[i].data && ztr->chunk[i].ztr_owns) | |
448 xfree(ztr->chunk[i].data); | |
449 if (ztr->chunk[i].mdata && ztr->chunk[i].ztr_owns) | |
450 xfree(ztr->chunk[i].mdata); | |
451 } | |
452 xfree(ztr->chunk); | |
453 } | |
454 | |
455 if (ztr->hcodes) { | |
456 for (i = 0; i < ztr->nhcodes; i++) { | |
457 if (ztr->hcodes[i].codes && ztr->hcodes[i].ztr_owns) | |
458 huffman_codeset_destroy(ztr->hcodes[i].codes); | |
459 } | |
460 free(ztr->hcodes); | |
461 } | |
462 | |
463 if (ztr->text_segments) | |
464 xfree(ztr->text_segments); | |
465 | |
466 xfree(ztr); | |
467 } | |
468 | |
469 /* | |
470 * ztr_find_chunks | |
471 * | |
472 * Searches for chunks of a specific type. | |
473 * | |
474 * Returns: | |
475 * Array of ztr_chunk_t pointers (into the ztr struct). This is | |
476 * allocated by malloc and it is the callers duty to free this. | |
477 * NULL if none found. | |
478 */ | |
479 ztr_chunk_t **ztr_find_chunks(ztr_t *ztr, uint4 type, int *nchunks_p) { | |
480 ztr_chunk_t **chunks = NULL; | |
481 int nchunks = 0; | |
482 int i; | |
483 | |
484 for (i = 0; i < ztr->nchunks; i++) { | |
485 if (ztr->chunk[i].type == type) { | |
486 chunks = (ztr_chunk_t **)xrealloc(chunks, (nchunks + 1) * | |
487 sizeof(*chunks)); | |
488 chunks[nchunks++] = &ztr->chunk[i]; | |
489 } | |
490 } | |
491 *nchunks_p = nchunks; | |
492 return chunks; | |
493 } | |
494 | |
495 /* | |
496 * Shannon showed that for storage in base 'b' with alphabet symbols 'a' having | |
497 * a probability of ocurring in any context of 'Pa' we should encode | |
498 * symbol 'a' to have a storage width of -logb(Pa). | |
499 * | |
500 * Eg. b = 26, P(e) = .22. => width .4647277. | |
501 * | |
502 * We use this to calculate the entropy of a signal by summing over all letters | |
503 * in the signal. In this case, our storage has base 256. | |
504 */ | |
505 #define EBASE 256 | |
506 static double entropy(unsigned char *data, int len) { | |
507 double E[EBASE]; | |
508 double P[EBASE]; | |
509 double e; | |
510 int i; | |
511 | |
512 for (i = 0; i < EBASE; i++) | |
513 P[i] = 0; | |
514 | |
515 for (i = 0; i < len; i++) | |
516 P[data[i]]++; | |
517 | |
518 for (i = 0; i < EBASE; i++) { | |
519 if (P[i]) { | |
520 P[i] /= len; | |
521 E[i] = -(log(P[i])/log(EBASE)); | |
522 } else { | |
523 E[i] = 0; | |
524 } | |
525 } | |
526 | |
527 for (e = i = 0; i < len; i++) | |
528 e += E[data[i]]; | |
529 | |
530 return e; | |
531 } | |
532 | |
533 /* | |
534 * Adds a user-defined huffman_codeset_t code-set to the available code sets | |
535 * used by huffman_encode or huffman_decode. | |
536 * | |
537 * Note that the 'codes' memory is then "owned" by the ztr object if "ztr_owns" | |
538 * is true and will be deallocated when the ztr object is destroyed. Otherwise | |
539 * freeing the ztr object will not touch the passed in codes. | |
540 */ | |
541 ztr_hcode_t *ztr_add_hcode(ztr_t *ztr, huffman_codeset_t *codes, | |
542 int ztr_owns) { | |
543 if (!codes) | |
544 return NULL; | |
545 | |
546 ztr->hcodes = realloc(ztr->hcodes, (ztr->nhcodes+1)*sizeof(*ztr->hcodes)); | |
547 ztr->hcodes[ztr->nhcodes].codes = codes; | |
548 ztr->hcodes[ztr->nhcodes].ztr_owns = ztr_owns; | |
549 | |
550 return &ztr->hcodes[ztr->nhcodes++]; | |
551 } | |
552 | |
553 /* | |
554 * Searches through the cached huffman_codeset_t tables looking for a stored | |
555 * huffman code of type 'code_set'. | |
556 * NB: only code_sets >= CODE_USER will be stored here. | |
557 * | |
558 * Returns codes on success, | |
559 * NULL on failure | |
560 */ | |
561 ztr_hcode_t *ztr_find_hcode(ztr_t *ztr, int code_set) { | |
562 int i; | |
563 | |
564 if (code_set < CODE_USER) | |
565 return NULL; /* computed on-the-fly or use a hard-coded set */ | |
566 | |
567 /* Check through chunks for undecoded HUFF chunks */ | |
568 if (!ztr->hcodes_checked) { | |
569 for (i = 0; i < ztr->nchunks; i++) { | |
570 if (ztr->chunk[i].type == ZTR_TYPE_HUFF) { | |
571 block_t *blk; | |
572 huffman_codeset_t *cs; | |
573 uncompress_chunk(ztr, &ztr->chunk[i]); | |
574 blk = block_create((unsigned char *)(ztr->chunk[i].data+2), | |
575 ztr->chunk[i].dlength-2); | |
576 cs = restore_codes(blk, NULL); | |
577 if (!cs) { | |
578 block_destroy(blk, 1); | |
579 return NULL; | |
580 } | |
581 cs->code_set = (unsigned char)(ztr->chunk[i].data[1]); | |
582 ztr_add_hcode(ztr, cs, 1); | |
583 block_destroy(blk, 1); | |
584 } | |
585 } | |
586 ztr->hcodes_checked = 1; | |
587 } | |
588 | |
589 /* Check cached copies */ | |
590 for (i = 0; i < ztr->nhcodes; i++) { | |
591 if (ztr->hcodes[i].codes->code_set == code_set) | |
592 return &ztr->hcodes[i]; | |
593 } | |
594 | |
595 return NULL; | |
596 } | |
597 | |
598 ztr_chunk_t *ztr_find_hcode_chunk(ztr_t *ztr, int code_set) { | |
599 int i; | |
600 | |
601 if (code_set < CODE_USER) | |
602 return NULL; /* computed on-the-fly or use a hard-coded set */ | |
603 | |
604 /* Check through chunks for undecoded HUFF chunks */ | |
605 for (i = 0; i < ztr->nchunks; i++) { | |
606 if (ztr->chunk[i].type == ZTR_TYPE_HUFF) { | |
607 uncompress_chunk(ztr, &ztr->chunk[i]); | |
608 if (ztr->chunk[i].dlength >= 2 && | |
609 (unsigned char)ztr->chunk[i].data[1] == code_set) | |
610 return &ztr->chunk[i]; | |
611 } | |
612 } | |
613 | |
614 return NULL; | |
615 } | |
616 | |
617 /* | |
618 * Adds a new chunk to a ztr file and returns the chunk pointer. | |
619 * The data and mdata fields can be NULL and the chunk will not be | |
620 * initialised. | |
621 * | |
622 * Returns new chunk ptr on success. | |
623 * NULL on failure. | |
624 */ | |
625 ztr_chunk_t *ztr_new_chunk(ztr_t *ztr, uint4 type, | |
626 char *data, uint4 dlength, | |
627 char *mdata, uint4 mdlength) { | |
628 ztr_chunk_t *chunks, *c; | |
629 | |
630 /* Grow the chunk array */ | |
631 chunks = (ztr_chunk_t *)realloc(ztr->chunk, | |
632 (ztr->nchunks+1) * sizeof(*chunks)); | |
633 if (!chunks) | |
634 return NULL; | |
635 ztr->chunk = chunks; | |
636 | |
637 /* Initialise */ | |
638 c = &ztr->chunk[ztr->nchunks++]; | |
639 c->type = type; | |
640 c->data = data; | |
641 c->dlength = dlength; | |
642 c->mdata = mdata; | |
643 c->mdlength = mdlength; | |
644 c->ztr_owns = 1; | |
645 | |
646 return c; | |
647 } | |
648 | |
649 /* | |
650 * Adds a key/value pair to a ztr TEXT chunk. | |
651 * The 'ch' chunk may be explicitly specified in which case the text | |
652 * is added to that chunk or it may be specified as NULL in which case | |
653 * the key/value pair are added to the first available TEXT chunk, | |
654 * possibly creating a new one if required. | |
655 * | |
656 * NOTE: If the key already exists in the text chunk this appends a new | |
657 * copy; it does not overwrite the old one. | |
658 * | |
659 * Returns ztr text chunk ptr for success | |
660 * NULL for failure | |
661 */ | |
662 ztr_chunk_t *ztr_add_text(ztr_t *z, ztr_chunk_t *ch, | |
663 const char *key, const char *value) { | |
664 ztr_chunk_t **text_chunks = NULL; | |
665 int ntext_chunks; | |
666 size_t key_len, value_len; | |
667 char *cp; | |
668 | |
669 /* Find/create the appropriate chunk */ | |
670 if (!ch) { | |
671 text_chunks = ztr_find_chunks(z, ZTR_TYPE_TEXT, &ntext_chunks); | |
672 if (!text_chunks) { | |
673 ch = ztr_new_chunk(z, ZTR_TYPE_TEXT, NULL, 0, NULL, 0); | |
674 } else { | |
675 ch = text_chunks[0]; | |
676 xfree(text_chunks); | |
677 } | |
678 } | |
679 | |
680 if (ch->type != ZTR_TYPE_TEXT) | |
681 return NULL; | |
682 | |
683 /* Make sure it's not compressed */ | |
684 uncompress_chunk(z, ch); | |
685 | |
686 /* Append key\0value\0 */ | |
687 key_len = strlen(key); | |
688 value_len = strlen(value); | |
689 cp = ch->data; | |
690 if (cp) { | |
691 /* Set ch->dlength to the last non-nul byte of the previous value */ | |
692 while (ch->dlength && ch->data[ch->dlength-1] == 0) | |
693 ch->dlength--; | |
694 } | |
695 | |
696 cp = realloc(ch->data, 1 + ch->dlength + key_len + value_len + 3); | |
697 if (NULL == cp) | |
698 return NULL; | |
699 else | |
700 ch->data = cp; | |
701 | |
702 cp = &ch->data[ch->dlength]; | |
703 | |
704 /* | |
705 * Note this is a bit cryptic, but it works. | |
706 * When appending to an existing text chunk we write a preceeding nul | |
707 * to mark the end of the previous value (we rewound above specifically | |
708 * for this case). | |
709 * When creating a new chunk we still write a nul, but in this case it's | |
710 * the RAW format byte. After the value we add an extra nul to | |
711 * indicate the last entry. | |
712 */ | |
713 ch->dlength += 1+sprintf(cp, "%c%s%c%s%c", 0, key, 0, value, 0); | |
714 | |
715 return ch; | |
716 } | |
717 | |
718 | |
719 | |
720 /* | |
721 * Stores held ztr huffman_codes as ZTR chunks. | |
722 * Returns 0 for success | |
723 * -1 for failure | |
724 */ | |
725 int ztr_store_hcodes(ztr_t *ztr) { | |
726 int i; | |
727 ztr_chunk_t *chunks; | |
728 int nchunks; | |
729 | |
730 if (ztr->nhcodes == 0) | |
731 return 0; | |
732 | |
733 /* Extend chunks array */ | |
734 nchunks = ztr->nchunks + ztr->nhcodes; | |
735 chunks = (ztr_chunk_t *)realloc(ztr->chunk, nchunks * sizeof(*chunks)); | |
736 if (!chunks) | |
737 return -1; | |
738 ztr->chunk = chunks; | |
739 | |
740 /* Encode */ | |
741 for (i = 0; i < ztr->nhcodes; i++) { | |
742 block_t *blk = block_create(NULL, 2); | |
743 int j = ztr->nchunks; | |
744 unsigned char bytes[2]; | |
745 | |
746 ztr->chunk[j].type = ZTR_TYPE_HUFF; | |
747 ztr->chunk[j].mdata = 0; | |
748 ztr->chunk[j].mdlength = 0; | |
749 ztr->chunk[j].ztr_owns = 1; | |
750 bytes[0] = 0; | |
751 bytes[1] = ztr->hcodes[i].codes->code_set; | |
752 store_bytes(blk, bytes, 2); | |
753 /* FIXME: Now already cached in ztr_hcode_t */ | |
754 if (0 == store_codes(blk, ztr->hcodes[i].codes, 1)) { | |
755 /* Last byte is always merged with first of stream */ | |
756 if (blk->bit == 0) { | |
757 unsigned char zero = 0; | |
758 store_bytes(blk, &zero, 1); | |
759 } | |
760 | |
761 ztr->chunk[j].data = (char *)blk->data; | |
762 ztr->chunk[j].dlength = blk->byte + (blk->bit != 0); | |
763 block_destroy(blk, 1); | |
764 ztr->nchunks++; | |
765 } | |
766 } | |
767 | |
768 return ztr->nchunks == nchunks ? 0 : -1; | |
769 } | |
770 | |
771 /* | |
772 * Given a ZTR chunk this searches through the meta-data key/value pairings | |
773 * to return the corresponding value. | |
774 * | |
775 * Returns a pointer into the mdata on success (nul-terminated) | |
776 * NULL on failure. | |
777 */ | |
778 char *ztr_lookup_mdata_value(ztr_t *z, ztr_chunk_t *chunk, char *key) { | |
779 if (z->header.version_major > 1 || | |
780 z->header.version_minor >= 2) { | |
781 /* ZTR format 1.2 onwards */ | |
782 char *cp = chunk->mdata; | |
783 int32_t dlen = chunk->mdlength; | |
784 | |
785 /* | |
786 * NB: we may wish to rewrite this using a dedicated state machine | |
787 * instead of strlen/strcmp as this currently assumes the meta- | |
788 * data is correctly formatted, which we cannot assume as the | |
789 * metadata is external and outside of our control. | |
790 * Passing in non-nul terminated strings could crash this code. | |
791 */ | |
792 while (dlen > 0) { | |
793 size_t l; | |
794 int found; | |
795 | |
796 /* key */ | |
797 l = strlen(cp); | |
798 found = strcmp(cp, key) == 0; | |
799 cp += l+1; | |
800 dlen -= l+1; | |
801 | |
802 /* value */ | |
803 if (found) | |
804 return cp; | |
805 l = strlen(cp); | |
806 cp += l+1; | |
807 dlen -= l+1; | |
808 } | |
809 return NULL; | |
810 | |
811 } else { | |
812 /* v1.1 and before only supported a few types, specifically coded | |
813 * per chunk type. | |
814 */ | |
815 switch (chunk->type) { | |
816 case ZTR_TYPE_SAMP: | |
817 case ZTR_TYPE_SMP4: | |
818 if (strcmp(key, "TYPE")) | |
819 return chunk->mdata; | |
820 break; | |
821 | |
822 default: | |
823 break; | |
824 } | |
825 } | |
826 | |
827 return NULL; | |
828 } | |
829 | |
830 /* | |
831 * Compresses an individual chunk using a specific format. The format is one | |
832 * of the 'format' fields listed in the spec; one of the ZTR_FORM_ macros. | |
833 */ | |
834 int compress_chunk(ztr_t *ztr, ztr_chunk_t *chunk, int format, | |
835 int option, int option2) { | |
836 char *new_data = NULL; | |
837 int new_len; | |
838 | |
839 switch (format) { | |
840 case ZTR_FORM_RAW: | |
841 return 0; | |
842 | |
843 case ZTR_FORM_RLE: | |
844 new_data = rle(chunk->data, chunk->dlength, option, &new_len); | |
845 if (entropy((unsigned char *)new_data, new_len) >= | |
846 entropy((unsigned char *)chunk->data, chunk->dlength)) { | |
847 xfree(new_data); | |
848 return 0; | |
849 } | |
850 break; | |
851 | |
852 case ZTR_FORM_XRLE: | |
853 new_data = xrle(chunk->data, chunk->dlength, option,option2, &new_len); | |
854 break; | |
855 | |
856 case ZTR_FORM_XRLE2: | |
857 new_data = xrle2(chunk->data, chunk->dlength, option, &new_len); | |
858 break; | |
859 | |
860 case ZTR_FORM_ZLIB: | |
861 new_data = zlib_huff(chunk->data, chunk->dlength, option, &new_len); | |
862 break; | |
863 | |
864 case ZTR_FORM_DELTA1: | |
865 new_data = decorrelate1(chunk->data, chunk->dlength, option, &new_len); | |
866 break; | |
867 | |
868 case ZTR_FORM_DDELTA1: | |
869 new_data = decorrelate1dyn(chunk->data, chunk->dlength, &new_len); | |
870 break; | |
871 | |
872 case ZTR_FORM_DELTA2: | |
873 new_data = decorrelate2(chunk->data, chunk->dlength, option, &new_len); | |
874 break; | |
875 | |
876 case ZTR_FORM_DDELTA2: | |
877 new_data = decorrelate2dyn(chunk->data, chunk->dlength, &new_len); | |
878 break; | |
879 | |
880 case ZTR_FORM_DELTA4: | |
881 new_data = decorrelate4(chunk->data, chunk->dlength, option, &new_len); | |
882 break; | |
883 | |
884 case ZTR_FORM_16TO8: | |
885 new_data = shrink_16to8(chunk->data, chunk->dlength, &new_len); | |
886 break; | |
887 | |
888 case ZTR_FORM_32TO8: | |
889 new_data = shrink_32to8(chunk->data, chunk->dlength, &new_len); | |
890 break; | |
891 | |
892 case ZTR_FORM_FOLLOW1: | |
893 new_data = follow1(chunk->data, chunk->dlength, &new_len); | |
894 break; | |
895 | |
896 case ZTR_FORM_ICHEB: | |
897 new_data = ichebcomp(chunk->data, chunk->dlength, &new_len); | |
898 break; | |
899 | |
900 case ZTR_FORM_LOG2: | |
901 new_data = log2_data(chunk->data, chunk->dlength, &new_len); | |
902 break; | |
903 | |
904 case ZTR_FORM_STHUFF: | |
905 new_data = sthuff(ztr, chunk->data, chunk->dlength, | |
906 option, option2, &new_len); | |
907 break; | |
908 | |
909 case ZTR_FORM_QSHIFT: | |
910 new_data = qshift(chunk->data, chunk->dlength, &new_len); | |
911 break; | |
912 | |
913 case ZTR_FORM_TSHIFT: | |
914 new_data = tshift(ztr, chunk->data, chunk->dlength, &new_len); | |
915 break; | |
916 } | |
917 | |
918 if (!new_data) { | |
919 fprintf(stderr, "!!ERROR!!\n"); | |
920 return -1; | |
921 } | |
922 | |
923 /* | |
924 fprintf(stderr, "Format %d => %d to %d\n", format, chunk->dlength, new_len); | |
925 */ | |
926 | |
927 chunk->dlength = new_len; | |
928 xfree(chunk->data); | |
929 chunk->data = new_data; | |
930 | |
931 return 0; | |
932 } | |
933 | |
934 /* | |
935 * Uncompresses an individual chunk from all levels of compression. | |
936 */ | |
937 int uncompress_chunk(ztr_t *ztr, ztr_chunk_t *chunk) { | |
938 char *new_data = NULL; | |
939 int new_len; | |
940 | |
941 while (chunk->dlength > 0 && chunk->data[0] != ZTR_FORM_RAW) { | |
942 switch (chunk->data[0]) { | |
943 case ZTR_FORM_RLE: | |
944 new_data = unrle(chunk->data, chunk->dlength, &new_len); | |
945 break; | |
946 | |
947 case ZTR_FORM_XRLE: | |
948 new_data = unxrle(chunk->data, chunk->dlength, &new_len); | |
949 break; | |
950 | |
951 case ZTR_FORM_XRLE2: | |
952 new_data = unxrle2(chunk->data, chunk->dlength, &new_len); | |
953 break; | |
954 | |
955 case ZTR_FORM_ZLIB: | |
956 new_data = zlib_dehuff(chunk->data, chunk->dlength, &new_len); | |
957 break; | |
958 | |
959 case ZTR_FORM_DELTA1: | |
960 new_data = recorrelate1(chunk->data, chunk->dlength, &new_len); | |
961 break; | |
962 | |
963 case ZTR_FORM_DELTA2: | |
964 new_data = recorrelate2(chunk->data, chunk->dlength, &new_len); | |
965 break; | |
966 | |
967 case ZTR_FORM_DELTA4: | |
968 new_data = recorrelate4(chunk->data, chunk->dlength, &new_len); | |
969 break; | |
970 | |
971 case ZTR_FORM_16TO8: | |
972 new_data = expand_8to16(chunk->data, chunk->dlength, &new_len); | |
973 break; | |
974 | |
975 case ZTR_FORM_32TO8: | |
976 new_data = expand_8to32(chunk->data, chunk->dlength, &new_len); | |
977 break; | |
978 | |
979 case ZTR_FORM_FOLLOW1: | |
980 new_data = unfollow1(chunk->data, chunk->dlength, &new_len); | |
981 break; | |
982 | |
983 case ZTR_FORM_ICHEB: | |
984 new_data = ichebuncomp(chunk->data, chunk->dlength, &new_len); | |
985 break; | |
986 | |
987 case ZTR_FORM_LOG2: | |
988 new_data = unlog2_data(chunk->data, chunk->dlength, &new_len); | |
989 break; | |
990 | |
991 case ZTR_FORM_STHUFF: | |
992 new_data = unsthuff(ztr, chunk->data, chunk->dlength, &new_len); | |
993 break; | |
994 | |
995 case ZTR_FORM_QSHIFT: | |
996 new_data = unqshift(chunk->data, chunk->dlength, &new_len); | |
997 break; | |
998 | |
999 case ZTR_FORM_TSHIFT: | |
1000 new_data = untshift(ztr, chunk->data, chunk->dlength, &new_len); | |
1001 break; | |
1002 | |
1003 default: | |
1004 fprintf(stderr, "Unknown encoding format %d\n", chunk->data[0]); | |
1005 return -1; | |
1006 } | |
1007 | |
1008 if (!new_data) | |
1009 return -1; | |
1010 | |
1011 /* | |
1012 fprintf(stderr, "format %d => %d to %d\n", | |
1013 chunk->data[0], chunk->dlength, new_len); | |
1014 */ | |
1015 | |
1016 chunk->dlength = new_len; | |
1017 xfree(chunk->data); | |
1018 chunk->data = new_data; | |
1019 } | |
1020 | |
1021 return 0; | |
1022 } | |
1023 | |
1024 /* | |
1025 * Compresses a ztr (in memory). | |
1026 * Level is 0, 1, 2 or 3 (no compression, delta, delta + zlib, | |
1027 * chebyshev + zlib). | |
1028 */ | |
1029 int compress_ztr(ztr_t *ztr, int level) { | |
1030 int i; | |
1031 | |
1032 if (0 == level) | |
1033 return 0; | |
1034 | |
1035 for (i = 0; i < ztr->nchunks; i++) { | |
1036 /* | |
1037 { | |
1038 char str[5]; | |
1039 fprintf(stderr, "---- %.4s ----\n", | |
1040 ZTR_BE2STR(ztr->chunk[i].type,str)); | |
1041 } | |
1042 fprintf(stderr, "Uncomp length=%d\n", ztr->chunk[i].dlength); | |
1043 */ | |
1044 | |
1045 switch(ztr->chunk[i].type) { | |
1046 char *type; | |
1047 case ZTR_TYPE_SAMP: | |
1048 case ZTR_TYPE_SMP4: | |
1049 #ifdef ILLUMINA_GA | |
1050 compress_chunk(ztr, &ztr->chunk[i], | |
1051 ZTR_FORM_STHUFF, CODE_TRACES, 0); | |
1052 #else | |
1053 type = ztr_lookup_mdata_value(ztr, &ztr->chunk[i], "TYPE"); | |
1054 if (type && 0 == strcmp(type, "PYRW")) { | |
1055 /* Raw data is not really compressable */ | |
1056 } else if (type && 0 == strcmp(type, "PYNO")) { | |
1057 if (level > 1) { | |
1058 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_16TO8, 0, 0); | |
1059 compress_chunk(ztr, &ztr->chunk[i], | |
1060 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0); | |
1061 } | |
1062 } else { | |
1063 if (level <= 2) { | |
1064 /* | |
1065 * Experiments show that typically a double delta does | |
1066 * better than a single delta for 8-bit data, and the other | |
1067 * way around for 16-bit data | |
1068 */ | |
1069 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA2, | |
1070 ztr->delta_level, 0); | |
1071 } else { | |
1072 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_ICHEB, 0, 0); | |
1073 } | |
1074 | |
1075 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_16TO8, 0, 0); | |
1076 if (level > 1) { | |
1077 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_FOLLOW1,0, 0); | |
1078 /* | |
1079 compress_chunk(ztr, &ztr->chunk[i], | |
1080 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY); | |
1081 */ | |
1082 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE, 150, 0); | |
1083 compress_chunk(ztr, &ztr->chunk[i], | |
1084 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0); | |
1085 } | |
1086 } | |
1087 #endif | |
1088 break; | |
1089 | |
1090 case ZTR_TYPE_BASE: | |
1091 #ifdef ILLUMINA_GA | |
1092 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_STHUFF, CODE_DNA, 0); | |
1093 #else | |
1094 if (level > 1) { | |
1095 compress_chunk(ztr, &ztr->chunk[i], | |
1096 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0); | |
1097 } | |
1098 #endif | |
1099 break; | |
1100 | |
1101 case ZTR_TYPE_CNF1: | |
1102 case ZTR_TYPE_CNF4: | |
1103 case ZTR_TYPE_CSID: | |
1104 #ifdef ILLUMINA_GA | |
1105 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE, 77, 0); | |
1106 compress_chunk(ztr, &ztr->chunk[i], | |
1107 ZTR_FORM_STHUFF, CODE_CONF_RLE, 0); | |
1108 #else | |
1109 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA1, 1, 0); | |
1110 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_RLE, 77, 0); | |
1111 if (level > 1) { | |
1112 compress_chunk(ztr, &ztr->chunk[i], | |
1113 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0); | |
1114 } | |
1115 #endif | |
1116 break; | |
1117 | |
1118 case ZTR_TYPE_BPOS: | |
1119 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_DELTA4, 1, 0); | |
1120 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_32TO8, 0, 0); | |
1121 if (level > 1) { | |
1122 compress_chunk(ztr, &ztr->chunk[i], | |
1123 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0); | |
1124 } | |
1125 break; | |
1126 | |
1127 case ZTR_TYPE_TEXT: | |
1128 #ifdef ILLUMINA_GA | |
1129 #else | |
1130 if (level > 1) { | |
1131 compress_chunk(ztr, &ztr->chunk[i], | |
1132 ZTR_FORM_ZLIB, Z_HUFFMAN_ONLY, 0); | |
1133 } | |
1134 #endif | |
1135 break; | |
1136 | |
1137 case ZTR_TYPE_FLWO: | |
1138 compress_chunk(ztr, &ztr->chunk[i], ZTR_FORM_XRLE, 0, 4); | |
1139 break; | |
1140 | |
1141 } | |
1142 | |
1143 /* | |
1144 fprintf(stderr, "Comp length=%d\n", ztr->chunk[i].dlength); | |
1145 */ | |
1146 } | |
1147 | |
1148 return 0; | |
1149 } | |
1150 | |
1151 /* | |
1152 * Uncompresses a ztr (in memory). | |
1153 */ | |
1154 int uncompress_ztr(ztr_t *ztr) { | |
1155 int i; | |
1156 | |
1157 for (i = 0; i < ztr->nchunks; i++) { | |
1158 /* | |
1159 { | |
1160 char str[5]; | |
1161 fprintf(stderr, "---- %.4s ----\n", | |
1162 ZTR_BE2STR(ztr->chunk[i].type,str)); | |
1163 } | |
1164 */ | |
1165 uncompress_chunk(ztr, &ztr->chunk[i]); | |
1166 } | |
1167 | |
1168 return 0; | |
1169 } | |
1170 |