comparison SNV/SNVMix2_source/SNVMix2-v0.12.1-rc1/samtools-0.1.6/bgzf.c @ 0:74f5ea818cea

Uploaded
author ryanmorin
date Wed, 12 Oct 2011 19:50:38 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:74f5ea818cea
1 /* The MIT License
2
3 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
11
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 THE SOFTWARE.
22 */
23
24 /*
25 2009-06-29 by lh3: cache recent uncompressed blocks.
26 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
27 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
28
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <unistd.h>
33 #include <fcntl.h>
34 #include <sys/types.h>
35 #include <sys/stat.h>
36 #include "bgzf.h"
37
38 #include "khash.h"
39 typedef struct {
40 int size;
41 uint8_t *block;
42 int64_t end_offset;
43 } cache_t;
44 KHASH_MAP_INIT_INT64(cache, cache_t)
45
46 #if defined(_WIN32) || defined(_MSC_VER)
47 #define ftello(fp) ftell(fp)
48 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
49 #else
50 extern off_t ftello(FILE *stream);
51 extern int fseeko(FILE *stream, off_t offset, int whence);
52 #endif
53
54 typedef int8_t bgzf_byte_t;
55
56 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
57 static const int MAX_BLOCK_SIZE = 64 * 1024;
58
59 static const int BLOCK_HEADER_LENGTH = 18;
60 static const int BLOCK_FOOTER_LENGTH = 8;
61
62 static const int GZIP_ID1 = 31;
63 static const int GZIP_ID2 = 139;
64 static const int CM_DEFLATE = 8;
65 static const int FLG_FEXTRA = 4;
66 static const int OS_UNKNOWN = 255;
67 static const int BGZF_ID1 = 66; // 'B'
68 static const int BGZF_ID2 = 67; // 'C'
69 static const int BGZF_LEN = 2;
70 static const int BGZF_XLEN = 6; // BGZF_LEN+4
71
72 static const int GZIP_WINDOW_BITS = -15; // no zlib header
73 static const int Z_DEFAULT_MEM_LEVEL = 8;
74
75
76 inline
77 void
78 packInt16(uint8_t* buffer, uint16_t value)
79 {
80 buffer[0] = value;
81 buffer[1] = value >> 8;
82 }
83
84 inline
85 int
86 unpackInt16(const uint8_t* buffer)
87 {
88 return (buffer[0] | (buffer[1] << 8));
89 }
90
91 inline
92 void
93 packInt32(uint8_t* buffer, uint32_t value)
94 {
95 buffer[0] = value;
96 buffer[1] = value >> 8;
97 buffer[2] = value >> 16;
98 buffer[3] = value >> 24;
99 }
100
101 static inline
102 int
103 bgzf_min(int x, int y)
104 {
105 return (x < y) ? x : y;
106 }
107
108 static
109 void
110 report_error(BGZF* fp, const char* message) {
111 fp->error = message;
112 }
113
114 static BGZF *bgzf_read_init()
115 {
116 BGZF *fp;
117 fp = calloc(1, sizeof(BGZF));
118 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
119 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
120 fp->compressed_block_size = MAX_BLOCK_SIZE;
121 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
122 fp->cache_size = 0;
123 fp->cache = kh_init(cache);
124 return fp;
125 }
126
127 static
128 BGZF*
129 open_read(int fd)
130 {
131 #ifdef _USE_KNETFILE
132 knetFile *file = knet_dopen(fd, "r");
133 #else
134 FILE* file = fdopen(fd, "r");
135 #endif
136 BGZF* fp;
137 if (file == 0) return 0;
138 fp = bgzf_read_init();
139 fp->file_descriptor = fd;
140 fp->open_mode = 'r';
141 #ifdef _USE_KNETFILE
142 fp->x.fpr = file;
143 #else
144 fp->file = file;
145 #endif
146 return fp;
147 }
148
149 static
150 BGZF*
151 open_write(int fd, bool is_uncompressed)
152 {
153 FILE* file = fdopen(fd, "w");
154 BGZF* fp;
155 if (file == 0) return 0;
156 fp = malloc(sizeof(BGZF));
157 fp->file_descriptor = fd;
158 fp->open_mode = 'w';
159 fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
160 #ifdef _USE_KNETFILE
161 fp->x.fpw = file;
162 #else
163 fp->file = file;
164 #endif
165 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
166 fp->uncompressed_block = NULL;
167 fp->compressed_block_size = MAX_BLOCK_SIZE;
168 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
169 fp->block_address = 0;
170 fp->block_offset = 0;
171 fp->block_length = 0;
172 fp->error = NULL;
173 return fp;
174 }
175
176 BGZF*
177 bgzf_open(const char* __restrict path, const char* __restrict mode)
178 {
179 BGZF* fp = NULL;
180 if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
181 #ifdef _USE_KNETFILE
182 knetFile *file = knet_open(path, mode);
183 if (file == 0) return 0;
184 fp = bgzf_read_init();
185 fp->file_descriptor = -1;
186 fp->open_mode = 'r';
187 fp->x.fpr = file;
188 #else
189 int fd, oflag = O_RDONLY;
190 #ifdef _WIN32
191 oflag |= O_BINARY;
192 #endif
193 fd = open(path, oflag);
194 if (fd == -1) return 0;
195 fp = open_read(fd);
196 #endif
197 } else if (mode[0] == 'w' || mode[0] == 'W') {
198 int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
199 #ifdef _WIN32
200 oflag |= O_BINARY;
201 #endif
202 fd = open(path, oflag, 0644);
203 if (fd == -1) return 0;
204 fp = open_write(fd, strstr(mode, "u")? 1 : 0);
205 }
206 if (fp != NULL) {
207 fp->owned_file = 1;
208 }
209 return fp;
210 }
211
212 BGZF*
213 bgzf_fdopen(int fd, const char * __restrict mode)
214 {
215 if (fd == -1) return 0;
216 if (mode[0] == 'r' || mode[0] == 'R') {
217 return open_read(fd);
218 } else if (mode[0] == 'w' || mode[0] == 'W') {
219 return open_write(fd, strstr(mode, "u")? 1 : 0);
220 } else {
221 return NULL;
222 }
223 }
224
225 static
226 int
227 deflate_block(BGZF* fp, int block_length)
228 {
229 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
230 // Also adds an extra field that stores the compressed block length.
231
232 bgzf_byte_t* buffer = fp->compressed_block;
233 int buffer_size = fp->compressed_block_size;
234
235 // Init gzip header
236 buffer[0] = GZIP_ID1;
237 buffer[1] = GZIP_ID2;
238 buffer[2] = CM_DEFLATE;
239 buffer[3] = FLG_FEXTRA;
240 buffer[4] = 0; // mtime
241 buffer[5] = 0;
242 buffer[6] = 0;
243 buffer[7] = 0;
244 buffer[8] = 0;
245 buffer[9] = OS_UNKNOWN;
246 buffer[10] = BGZF_XLEN;
247 buffer[11] = 0;
248 buffer[12] = BGZF_ID1;
249 buffer[13] = BGZF_ID2;
250 buffer[14] = BGZF_LEN;
251 buffer[15] = 0;
252 buffer[16] = 0; // placeholder for block length
253 buffer[17] = 0;
254
255 // loop to retry for blocks that do not compress enough
256 int input_length = block_length;
257 int compressed_length = 0;
258 while (1) {
259 int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
260 z_stream zs;
261 zs.zalloc = NULL;
262 zs.zfree = NULL;
263 zs.next_in = fp->uncompressed_block;
264 zs.avail_in = input_length;
265 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
266 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
267
268 int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
269 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
270 if (status != Z_OK) {
271 report_error(fp, "deflate init failed");
272 return -1;
273 }
274 status = deflate(&zs, Z_FINISH);
275 if (status != Z_STREAM_END) {
276 deflateEnd(&zs);
277 if (status == Z_OK) {
278 // Not enough space in buffer.
279 // Can happen in the rare case the input doesn't compress enough.
280 // Reduce the amount of input until it fits.
281 input_length -= 1024;
282 if (input_length <= 0) {
283 // should never happen
284 report_error(fp, "input reduction failed");
285 return -1;
286 }
287 continue;
288 }
289 report_error(fp, "deflate failed");
290 return -1;
291 }
292 status = deflateEnd(&zs);
293 if (status != Z_OK) {
294 report_error(fp, "deflate end failed");
295 return -1;
296 }
297 compressed_length = zs.total_out;
298 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
299 if (compressed_length > MAX_BLOCK_SIZE) {
300 // should never happen
301 report_error(fp, "deflate overflow");
302 return -1;
303 }
304 break;
305 }
306
307 packInt16((uint8_t*)&buffer[16], compressed_length-1);
308 uint32_t crc = crc32(0L, NULL, 0L);
309 crc = crc32(crc, fp->uncompressed_block, input_length);
310 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
311 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
312
313 int remaining = block_length - input_length;
314 if (remaining > 0) {
315 if (remaining > input_length) {
316 // should never happen (check so we can use memcpy)
317 report_error(fp, "remainder too large");
318 return -1;
319 }
320 memcpy(fp->uncompressed_block,
321 fp->uncompressed_block + input_length,
322 remaining);
323 }
324 fp->block_offset = remaining;
325 return compressed_length;
326 }
327
328 static
329 int
330 inflate_block(BGZF* fp, int block_length)
331 {
332 // Inflate the block in fp->compressed_block into fp->uncompressed_block
333
334 z_stream zs;
335 zs.zalloc = NULL;
336 zs.zfree = NULL;
337 zs.next_in = fp->compressed_block + 18;
338 zs.avail_in = block_length - 16;
339 zs.next_out = fp->uncompressed_block;
340 zs.avail_out = fp->uncompressed_block_size;
341
342 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
343 if (status != Z_OK) {
344 report_error(fp, "inflate init failed");
345 return -1;
346 }
347 status = inflate(&zs, Z_FINISH);
348 if (status != Z_STREAM_END) {
349 inflateEnd(&zs);
350 report_error(fp, "inflate failed");
351 return -1;
352 }
353 status = inflateEnd(&zs);
354 if (status != Z_OK) {
355 report_error(fp, "inflate failed");
356 return -1;
357 }
358 return zs.total_out;
359 }
360
361 static
362 int
363 check_header(const bgzf_byte_t* header)
364 {
365 return (header[0] == GZIP_ID1 &&
366 header[1] == (bgzf_byte_t) GZIP_ID2 &&
367 header[2] == Z_DEFLATED &&
368 (header[3] & FLG_FEXTRA) != 0 &&
369 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
370 header[12] == BGZF_ID1 &&
371 header[13] == BGZF_ID2 &&
372 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
373 }
374
375 static void free_cache(BGZF *fp)
376 {
377 khint_t k;
378 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
379 if (fp->open_mode != 'r') return;
380 for (k = kh_begin(h); k < kh_end(h); ++k)
381 if (kh_exist(h, k)) free(kh_val(h, k).block);
382 kh_destroy(cache, h);
383 }
384
385 static int load_block_from_cache(BGZF *fp, int64_t block_address)
386 {
387 khint_t k;
388 cache_t *p;
389 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
390 k = kh_get(cache, h, block_address);
391 if (k == kh_end(h)) return 0;
392 p = &kh_val(h, k);
393 if (fp->block_length != 0) fp->block_offset = 0;
394 fp->block_address = block_address;
395 fp->block_length = p->size;
396 memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
397 #ifdef _USE_KNETFILE
398 knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
399 #else
400 fseeko(fp->file, p->end_offset, SEEK_SET);
401 #endif
402 return p->size;
403 }
404
405 static void cache_block(BGZF *fp, int size)
406 {
407 int ret;
408 khint_t k;
409 cache_t *p;
410 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
411 if (MAX_BLOCK_SIZE >= fp->cache_size) return;
412 if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
413 /* A better way would be to remove the oldest block in the
414 * cache, but here we remove a random one for simplicity. This
415 * should not have a big impact on performance. */
416 for (k = kh_begin(h); k < kh_end(h); ++k)
417 if (kh_exist(h, k)) break;
418 if (k < kh_end(h)) {
419 free(kh_val(h, k).block);
420 kh_del(cache, h, k);
421 }
422 }
423 k = kh_put(cache, h, fp->block_address, &ret);
424 if (ret == 0) return; // if this happens, a bug!
425 p = &kh_val(h, k);
426 p->size = fp->block_length;
427 p->end_offset = fp->block_address + size;
428 p->block = malloc(MAX_BLOCK_SIZE);
429 memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
430 }
431
432 static
433 int
434 read_block(BGZF* fp)
435 {
436 bgzf_byte_t header[BLOCK_HEADER_LENGTH];
437 int size = 0;
438 #ifdef _USE_KNETFILE
439 int64_t block_address = knet_tell(fp->x.fpr);
440 if (load_block_from_cache(fp, block_address)) return 0;
441 int count = knet_read(fp->x.fpr, header, sizeof(header));
442 #else
443 int64_t block_address = ftello(fp->file);
444 if (load_block_from_cache(fp, block_address)) return 0;
445 int count = fread(header, 1, sizeof(header), fp->file);
446 #endif
447 if (count == 0) {
448 fp->block_length = 0;
449 return 0;
450 }
451 size = count;
452 if (count != sizeof(header)) {
453 report_error(fp, "read failed");
454 return -1;
455 }
456 if (!check_header(header)) {
457 report_error(fp, "invalid block header");
458 return -1;
459 }
460 int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
461 bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
462 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
463 int remaining = block_length - BLOCK_HEADER_LENGTH;
464 #ifdef _USE_KNETFILE
465 count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
466 #else
467 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
468 #endif
469 if (count != remaining) {
470 report_error(fp, "read failed");
471 return -1;
472 }
473 size += count;
474 count = inflate_block(fp, block_length);
475 if (count < 0) {
476 return -1;
477 }
478 if (fp->block_length != 0) {
479 // Do not reset offset if this read follows a seek.
480 fp->block_offset = 0;
481 }
482 fp->block_address = block_address;
483 fp->block_length = count;
484 cache_block(fp, size);
485 return 0;
486 }
487
488 int
489 bgzf_read(BGZF* fp, void* data, int length)
490 {
491 if (length <= 0) {
492 return 0;
493 }
494 if (fp->open_mode != 'r') {
495 report_error(fp, "file not open for reading");
496 return -1;
497 }
498
499 int bytes_read = 0;
500 bgzf_byte_t* output = data;
501 while (bytes_read < length) {
502 int available = fp->block_length - fp->block_offset;
503 if (available <= 0) {
504 if (read_block(fp) != 0) {
505 return -1;
506 }
507 available = fp->block_length - fp->block_offset;
508 if (available <= 0) {
509 break;
510 }
511 }
512 int copy_length = bgzf_min(length-bytes_read, available);
513 bgzf_byte_t* buffer = fp->uncompressed_block;
514 memcpy(output, buffer + fp->block_offset, copy_length);
515 fp->block_offset += copy_length;
516 output += copy_length;
517 bytes_read += copy_length;
518 }
519 if (fp->block_offset == fp->block_length) {
520 #ifdef _USE_KNETFILE
521 fp->block_address = knet_tell(fp->x.fpr);
522 #else
523 fp->block_address = ftello(fp->file);
524 #endif
525 fp->block_offset = 0;
526 fp->block_length = 0;
527 }
528 return bytes_read;
529 }
530
531 static
532 int
533 flush_block(BGZF* fp)
534 {
535 while (fp->block_offset > 0) {
536 int block_length = deflate_block(fp, fp->block_offset);
537 if (block_length < 0) {
538 return -1;
539 }
540 #ifdef _USE_KNETFILE
541 int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
542 #else
543 int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
544 #endif
545 if (count != block_length) {
546 report_error(fp, "write failed");
547 return -1;
548 }
549 fp->block_address += block_length;
550 }
551 return 0;
552 }
553
554 int
555 bgzf_write(BGZF* fp, const void* data, int length)
556 {
557 if (fp->open_mode != 'w') {
558 report_error(fp, "file not open for writing");
559 return -1;
560 }
561
562 if (fp->uncompressed_block == NULL) {
563 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
564 }
565
566 const bgzf_byte_t* input = data;
567 int block_length = fp->uncompressed_block_size;
568 int bytes_written = 0;
569 while (bytes_written < length) {
570 int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
571 bgzf_byte_t* buffer = fp->uncompressed_block;
572 memcpy(buffer + fp->block_offset, input, copy_length);
573 fp->block_offset += copy_length;
574 input += copy_length;
575 bytes_written += copy_length;
576 if (fp->block_offset == block_length) {
577 if (flush_block(fp) != 0) {
578 break;
579 }
580 }
581 }
582 return bytes_written;
583 }
584
585 int
586 bgzf_close(BGZF* fp)
587 {
588 if (fp->open_mode == 'w') {
589 if (flush_block(fp) != 0) {
590 return -1;
591 }
592 { // add an empty block
593 int count, block_length = deflate_block(fp, 0);
594 #ifdef _USE_KNETFILE
595 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
596 #else
597 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
598 #endif
599 }
600 #ifdef _USE_KNETFILE
601 if (fflush(fp->x.fpw) != 0) {
602 #else
603 if (fflush(fp->file) != 0) {
604 #endif
605 report_error(fp, "flush failed");
606 return -1;
607 }
608 }
609 if (fp->owned_file) {
610 #ifdef _USE_KNETFILE
611 int ret;
612 if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
613 else ret = knet_close(fp->x.fpr);
614 if (ret != 0) return -1;
615 #else
616 if (fclose(fp->file) != 0) {
617 return -1;
618 }
619 #endif
620 }
621 free(fp->uncompressed_block);
622 free(fp->compressed_block);
623 free_cache(fp);
624 free(fp);
625 return 0;
626 }
627
628 int64_t
629 bgzf_tell(BGZF* fp)
630 {
631 return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF));
632 }
633
634 void bgzf_set_cache_size(BGZF *fp, int cache_size)
635 {
636 if (fp) fp->cache_size = cache_size;
637 }
638
639 int bgzf_check_EOF(BGZF *fp)
640 {
641 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
642 uint8_t buf[28];
643 off_t offset;
644 #ifdef _USE_KNETFILE
645 offset = knet_tell(fp->x.fpr);
646 if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
647 knet_read(fp->x.fpr, buf, 28);
648 knet_seek(fp->x.fpr, offset, SEEK_SET);
649 #else
650 offset = ftello(fp->file);
651 if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
652 fread(buf, 1, 28, fp->file);
653 fseeko(fp->file, offset, SEEK_SET);
654 #endif
655 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
656 }
657
658 int64_t
659 bgzf_seek(BGZF* fp, int64_t pos, int where)
660 {
661 if (fp->open_mode != 'r') {
662 report_error(fp, "file not open for read");
663 return -1;
664 }
665 if (where != SEEK_SET) {
666 report_error(fp, "unimplemented seek option");
667 return -1;
668 }
669 int block_offset = pos & 0xFFFF;
670 int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
671 #ifdef _USE_KNETFILE
672 if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
673 #else
674 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
675 #endif
676 report_error(fp, "seek failed");
677 return -1;
678 }
679 fp->block_length = 0; // indicates current block is not loaded
680 fp->block_address = block_address;
681 fp->block_offset = block_offset;
682 return 0;
683 }