comparison ezBAMQC/src/htslib/cram/cram_decode.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /*
2 Copyright (c) 2012-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * - In-memory decoding of CRAM data structures.
33 * - Iterator for reading CRAM record by record.
34 */
35
36 #ifdef HAVE_CONFIG_H
37 #include "io_lib_config.h"
38 #endif
39
40 #include <stdio.h>
41 #include <errno.h>
42 #include <assert.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <zlib.h>
46 #include <sys/types.h>
47 #include <sys/stat.h>
48 #include <math.h>
49 #include <ctype.h>
50
51 #include "cram/cram.h"
52 #include "cram/os.h"
53 #include "cram/md5.h"
54
55 //Whether CIGAR has just M or uses = and X to indicate match and mismatch
56 //#define USE_X
57
58 /* ----------------------------------------------------------------------
59 * CRAM compression headers
60 */
61
62 /*
63 * Decodes the Tag Dictionary record in the preservation map
64 * Updates the cram compression header.
65 *
66 * Returns number of bytes decoded on success
67 * -1 on failure
68 */
69 int cram_decode_TD(char *cp, cram_block_compression_hdr *h) {
70 char *op = cp;
71 unsigned char *dat;
72 cram_block *b;
73 int32_t blk_size;
74 int nTL, i, sz;
75
76 if (!(b = cram_new_block(0, 0)))
77 return -1;
78 h->TD_blk = b;
79
80 /* Decode */
81 cp += itf8_get(cp, &blk_size);
82 if (!blk_size) {
83 h->nTL = 0;
84 h->TL = NULL;
85 cram_free_block(b);
86 return cp - op;
87 }
88
89 BLOCK_APPEND(b, cp, blk_size);
90 cp += blk_size;
91 sz = cp - op;
92
93 // Force nul termination if missing
94 if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
95 BLOCK_APPEND_CHAR(b, '\0');
96
97 /* Set up TL lookup table */
98 dat = BLOCK_DATA(b);
99
100 // Count
101 for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
102 nTL++;
103 while (dat[i])
104 i++;
105 }
106
107 // Copy
108 h->nTL = nTL;
109 if (!(h->TL = calloc(h->nTL, sizeof(unsigned char *))))
110 return -1;
111 for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
112 h->TL[nTL++] = &dat[i];
113 while (dat[i])
114 i++;
115 }
116
117 return sz;
118 }
119
120 /*
121 * Decodes a CRAM block compression header.
122 * Returns header ptr on success
123 * NULL on failure
124 */
125 cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
126 cram_block *b) {
127 char *cp, *cp_copy;
128 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
129 int i;
130 int32_t map_size, map_count;
131
132 if (!hdr)
133 return NULL;
134
135 if (b->method != RAW) {
136 if (cram_uncompress_block(b)) {
137 free(hdr);
138 return NULL;
139 }
140 }
141
142 cp = (char *)b->data;
143
144 if (CRAM_MAJOR_VERS(fd->version) == 1) {
145 cp += itf8_get(cp, &hdr->ref_seq_id);
146 cp += itf8_get(cp, &hdr->ref_seq_start);
147 cp += itf8_get(cp, &hdr->ref_seq_span);
148 cp += itf8_get(cp, &hdr->num_records);
149 cp += itf8_get(cp, &hdr->num_landmarks);
150 if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
151 free(hdr);
152 return NULL;
153 }
154 for (i = 0; i < hdr->num_landmarks; i++) {
155 cp += itf8_get(cp, &hdr->landmark[i]);
156 }
157 }
158
159 hdr->preservation_map = kh_init(map);
160
161 memset(hdr->rec_encoding_map, 0,
162 CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
163 memset(hdr->tag_encoding_map, 0,
164 CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
165
166 if (!hdr->preservation_map) {
167 cram_free_compression_header(hdr);
168 return NULL;
169 }
170
171 /* Initialise defaults for preservation map */
172 hdr->mapped_qs_included = 0;
173 hdr->unmapped_qs_included = 0;
174 hdr->unmapped_placed = 0;
175 hdr->qs_included = 0;
176 hdr->read_names_included = 0;
177 hdr->AP_delta = 1;
178 memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
179
180 /* Preservation map */
181 cp += itf8_get(cp, &map_size); cp_copy = cp;
182 cp += itf8_get(cp, &map_count);
183 for (i = 0; i < map_count; i++) {
184 pmap_t hd;
185 khint_t k;
186 int r;
187
188 cp += 2;
189 switch(CRAM_KEY(cp[-2],cp[-1])) {
190 case CRAM_KEY('M','I'):
191 hd.i = *cp++;
192 k = kh_put(map, hdr->preservation_map, "MI", &r);
193 if (-1 == r) {
194 cram_free_compression_header(hdr);
195 return NULL;
196 }
197
198 kh_val(hdr->preservation_map, k) = hd;
199 hdr->mapped_qs_included = hd.i;
200 break;
201
202 case CRAM_KEY('U','I'):
203 hd.i = *cp++;
204 k = kh_put(map, hdr->preservation_map, "UI", &r);
205 if (-1 == r) {
206 cram_free_compression_header(hdr);
207 return NULL;
208 }
209
210 kh_val(hdr->preservation_map, k) = hd;
211 hdr->unmapped_qs_included = hd.i;
212 break;
213
214 case CRAM_KEY('P','I'):
215 hd.i = *cp++;
216 k = kh_put(map, hdr->preservation_map, "PI", &r);
217 if (-1 == r) {
218 cram_free_compression_header(hdr);
219 return NULL;
220 }
221
222 kh_val(hdr->preservation_map, k) = hd;
223 hdr->unmapped_placed = hd.i;
224 break;
225
226 case CRAM_KEY('R','N'):
227 hd.i = *cp++;
228 k = kh_put(map, hdr->preservation_map, "RN", &r);
229 if (-1 == r) {
230 cram_free_compression_header(hdr);
231 return NULL;
232 }
233
234 kh_val(hdr->preservation_map, k) = hd;
235 hdr->read_names_included = hd.i;
236 break;
237
238 case CRAM_KEY('A','P'):
239 hd.i = *cp++;
240 k = kh_put(map, hdr->preservation_map, "AP", &r);
241 if (-1 == r) {
242 cram_free_compression_header(hdr);
243 return NULL;
244 }
245
246 kh_val(hdr->preservation_map, k) = hd;
247 hdr->AP_delta = hd.i;
248 break;
249
250 case CRAM_KEY('R','R'):
251 hd.i = *cp++;
252 k = kh_put(map, hdr->preservation_map, "RR", &r);
253 if (-1 == r) {
254 cram_free_compression_header(hdr);
255 return NULL;
256 }
257
258 kh_val(hdr->preservation_map, k) = hd;
259 fd->no_ref = !hd.i;
260 break;
261
262 case CRAM_KEY('S','M'):
263 hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
264 hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
265 hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
266 hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
267
268 hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
269 hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
270 hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
271 hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
272
273 hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
274 hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
275 hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
276 hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
277
278 hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
279 hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
280 hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
281 hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
282
283 hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
284 hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
285 hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
286 hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
287
288 hd.p = cp;
289 cp += 5;
290
291 k = kh_put(map, hdr->preservation_map, "SM", &r);
292 if (-1 == r) {
293 cram_free_compression_header(hdr);
294 return NULL;
295 }
296 kh_val(hdr->preservation_map, k) = hd;
297 break;
298
299 case CRAM_KEY('T','D'): {
300 int sz = cram_decode_TD(cp, hdr); // tag dictionary
301 if (sz < 0) {
302 cram_free_compression_header(hdr);
303 return NULL;
304 }
305
306 hd.p = cp;
307 cp += sz;
308
309 k = kh_put(map, hdr->preservation_map, "TD", &r);
310 if (-1 == r) {
311 cram_free_compression_header(hdr);
312 return NULL;
313 }
314 kh_val(hdr->preservation_map, k) = hd;
315 break;
316 }
317
318 default:
319 fprintf(stderr, "Unrecognised preservation map key %c%c\n",
320 cp[-2], cp[-1]);
321 // guess byte;
322 cp++;
323 break;
324 }
325 }
326 if (cp - cp_copy != map_size) {
327 cram_free_compression_header(hdr);
328 return NULL;
329 }
330
331 /* Record encoding map */
332 cp += itf8_get(cp, &map_size); cp_copy = cp;
333 cp += itf8_get(cp, &map_count);
334 for (i = 0; i < map_count; i++) {
335 char *key = cp;
336 int32_t encoding;
337 int32_t size;
338 cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
339
340 if (!m) {
341 cram_free_compression_header(hdr);
342 return NULL;
343 }
344
345 cp += 2;
346 cp += itf8_get(cp, &encoding);
347 cp += itf8_get(cp, &size);
348
349 // Fill out cram_map purely for cram_dump to dump out.
350 m->key = (key[0]<<8)|key[1];
351 m->encoding = encoding;
352 m->size = size;
353 m->offset = cp - (char *)b->data;
354 m->codec = NULL;
355
356 if (m->encoding == E_NULL)
357 continue;
358
359 //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
360
361 /*
362 * For CRAM1.0 CF and BF are Byte and not Int.
363 * Practically speaking it makes no difference unless we have a
364 * 1.0 format file that stores these in EXTERNAL as only then
365 * does Byte vs Int matter.
366 *
367 * Neither this C code nor Java reference implementations did this,
368 * so we gloss over it and treat them as int.
369 */
370
371 if (key[0] == 'B' && key[1] == 'F') {
372 if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size,
373 E_INT,
374 fd->version))) {
375 cram_free_compression_header(hdr);
376 return NULL;
377 }
378 } else if (key[0] == 'C' && key[1] == 'F') {
379 if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size,
380 E_INT,
381 fd->version))) {
382 cram_free_compression_header(hdr);
383 return NULL;
384 }
385 } else if (key[0] == 'R' && key[1] == 'I') {
386 if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size,
387 E_INT,
388 fd->version))) {
389 cram_free_compression_header(hdr);
390 return NULL;
391 }
392 } else if (key[0] == 'R' && key[1] == 'L') {
393 if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size,
394 E_INT,
395 fd->version))) {
396 cram_free_compression_header(hdr);
397 return NULL;
398 }
399 } else if (key[0] == 'A' && key[1] == 'P') {
400 if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size,
401 E_INT,
402 fd->version))) {
403 cram_free_compression_header(hdr);
404 return NULL;
405 }
406 } else if (key[0] == 'R' && key[1] == 'G') {
407 if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size,
408 E_INT,
409 fd->version))) {
410 cram_free_compression_header(hdr);
411 return NULL;
412 }
413 } else if (key[0] == 'M' && key[1] == 'F') {
414 if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size,
415 E_INT,
416 fd->version))) {
417 cram_free_compression_header(hdr);
418 return NULL;
419 }
420 } else if (key[0] == 'N' && key[1] == 'S') {
421 if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size,
422 E_INT,
423 fd->version))) {
424 cram_free_compression_header(hdr);
425 return NULL;
426 }
427 } else if (key[0] == 'N' && key[1] == 'P') {
428 if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size,
429 E_INT,
430 fd->version))) {
431 cram_free_compression_header(hdr);
432 return NULL;
433 }
434 } else if (key[0] == 'T' && key[1] == 'S') {
435 if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size,
436 E_INT,
437 fd->version))) {
438 cram_free_compression_header(hdr);
439 return NULL;
440 }
441 } else if (key[0] == 'N' && key[1] == 'F') {
442 if (!(hdr->codecs[DS_NF] = cram_decoder_init(encoding, cp, size,
443 E_INT,
444 fd->version))) {
445 cram_free_compression_header(hdr);
446 return NULL;
447 }
448 } else if (key[0] == 'T' && key[1] == 'C') {
449 if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size,
450 E_BYTE,
451 fd->version))) {
452 cram_free_compression_header(hdr);
453 return NULL;
454 }
455 } else if (key[0] == 'T' && key[1] == 'N') {
456 if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size,
457 E_INT,
458 fd->version))) {
459 cram_free_compression_header(hdr);
460 return NULL;
461 }
462 } else if (key[0] == 'F' && key[1] == 'N') {
463 if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size,
464 E_INT,
465 fd->version))) {
466 cram_free_compression_header(hdr);
467 return NULL;
468 }
469 } else if (key[0] == 'F' && key[1] == 'C') {
470 if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size,
471 E_BYTE,
472 fd->version))) {
473 cram_free_compression_header(hdr);
474 return NULL;
475 }
476 } else if (key[0] == 'F' && key[1] == 'P') {
477 if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size,
478 E_INT,
479 fd->version))) {
480 cram_free_compression_header(hdr);
481 return NULL;
482 }
483 } else if (key[0] == 'B' && key[1] == 'S') {
484 if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size,
485 E_BYTE,
486 fd->version))) {
487 cram_free_compression_header(hdr);
488 return NULL;
489 }
490 } else if (key[0] == 'I' && key[1] == 'N') {
491 if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, size,
492 E_BYTE_ARRAY,
493 fd->version))) {
494 cram_free_compression_header(hdr);
495 return NULL;
496 }
497 } else if (key[0] == 'S' && key[1] == 'C') {
498 if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size,
499 E_BYTE_ARRAY,
500 fd->version))) {
501 cram_free_compression_header(hdr);
502 return NULL;
503 }
504 } else if (key[0] == 'D' && key[1] == 'L') {
505 if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size,
506 E_INT,
507 fd->version))) {
508 cram_free_compression_header(hdr);
509 return NULL;
510 }
511 } else if (key[0] == 'B' && key[1] == 'A') {
512 if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size,
513 E_BYTE,
514 fd->version))) {
515 cram_free_compression_header(hdr);
516 return NULL;
517 }
518 } else if (key[0] == 'B' && key[1] == 'B') {
519 if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size,
520 E_BYTE_ARRAY,
521 fd->version))) {
522 cram_free_compression_header(hdr);
523 return NULL;
524 }
525 } else if (key[0] == 'R' && key[1] == 'S') {
526 if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size,
527 E_INT,
528 fd->version))) {
529 cram_free_compression_header(hdr);
530 return NULL;
531 }
532 } else if (key[0] == 'P' && key[1] == 'D') {
533 if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size,
534 E_INT,
535 fd->version))) {
536 cram_free_compression_header(hdr);
537 return NULL;
538 }
539 } else if (key[0] == 'H' && key[1] == 'C') {
540 if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size,
541 E_INT,
542 fd->version))) {
543 cram_free_compression_header(hdr);
544 return NULL;
545 }
546 } else if (key[0] == 'M' && key[1] == 'Q') {
547 if (!(hdr->codecs[DS_MQ] = cram_decoder_init(encoding, cp, size,
548 E_INT,
549 fd->version))) {
550 cram_free_compression_header(hdr);
551 return NULL;
552 }
553 } else if (key[0] == 'R' && key[1] == 'N') {
554 if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size,
555 E_BYTE_ARRAY_BLOCK,
556 fd->version))) {
557 cram_free_compression_header(hdr);
558 return NULL;
559 }
560 } else if (key[0] == 'Q' && key[1] == 'S') {
561 if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size,
562 E_BYTE,
563 fd->version))) {
564 cram_free_compression_header(hdr);
565 return NULL;
566 }
567 } else if (key[0] == 'Q' && key[1] == 'Q') {
568 if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size,
569 E_BYTE_ARRAY,
570 fd->version))) {
571 cram_free_compression_header(hdr);
572 return NULL;
573 }
574 } else if (key[0] == 'T' && key[1] == 'L') {
575 if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size,
576 E_INT,
577 fd->version))) {
578 cram_free_compression_header(hdr);
579 return NULL;
580 }
581 } else if (key[0] == 'T' && key[1] == 'M') {
582 } else if (key[0] == 'T' && key[1] == 'V') {
583 } else
584 fprintf(stderr, "Unrecognised key: %.2s\n", key);
585
586 cp += size;
587
588 m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
589 hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
590 }
591 if (cp - cp_copy != map_size) {
592 cram_free_compression_header(hdr);
593 return NULL;
594 }
595
596 /* Tag encoding map */
597 cp += itf8_get(cp, &map_size); cp_copy = cp;
598 cp += itf8_get(cp, &map_count);
599 for (i = 0; i < map_count; i++) {
600 int32_t encoding;
601 int32_t size;
602 cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
603 char *key = cp+1;
604
605 if (!m) {
606 cram_free_compression_header(hdr);
607 return NULL;
608 }
609
610 m->key = (key[0]<<16)|(key[1]<<8)|key[2];
611
612 cp += 4; // Strictly ITF8, but this suffices
613 cp += itf8_get(cp, &encoding);
614 cp += itf8_get(cp, &size);
615
616 m->encoding = encoding;
617 m->size = size;
618 m->offset = cp - (char *)b->data;
619 if (!(m->codec = cram_decoder_init(encoding, cp, size,
620 E_BYTE_ARRAY_BLOCK, fd->version))) {
621 cram_free_compression_header(hdr);
622 free(m);
623 return NULL;
624 }
625
626 cp += size;
627
628 m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
629 hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
630 }
631 if (cp - cp_copy != map_size) {
632 cram_free_compression_header(hdr);
633 return NULL;
634 }
635
636 return hdr;
637 }
638
639 /*
640 * Note we also need to scan through the record encoding map to
641 * see which data series share the same block, either external or
642 * CORE. For example if we need the BF data series but MQ and CF
643 * are also encoded in the same block then we need to add those in
644 * as a dependency in order to correctly decode BF.
645 *
646 * Returns 0 on success
647 * -1 on failure
648 */
649 int cram_dependent_data_series(cram_fd *fd,
650 cram_block_compression_hdr *hdr,
651 cram_slice *s) {
652 int *block_used;
653 int core_used = 0;
654 int i;
655 static int i_to_id[] = {
656 DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
657 DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
658 DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
659 DS_HC, DS_SC, DS_BB, DS_QQ,
660 };
661 uint32_t orig_ds;
662
663 /*
664 * Set the data_series bit field based on fd->required_fields
665 * contents.
666 */
667 if (fd->required_fields && fd->required_fields != INT_MAX) {
668 hdr->data_series = 0;
669
670 if (fd->required_fields & SAM_QNAME)
671 hdr->data_series |= CRAM_RN;
672
673 if (fd->required_fields & SAM_FLAG)
674 hdr->data_series |= CRAM_BF;
675
676 if (fd->required_fields & SAM_RNAME)
677 hdr->data_series |= CRAM_RI | CRAM_BF;
678
679 if (fd->required_fields & SAM_POS)
680 hdr->data_series |= CRAM_AP | CRAM_BF;
681
682 if (fd->required_fields & SAM_MAPQ)
683 hdr->data_series |= CRAM_MQ;
684
685 if (fd->required_fields & SAM_CIGAR)
686 hdr->data_series |= CRAM_CIGAR;
687
688 if (fd->required_fields & SAM_RNEXT)
689 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
690
691 if (fd->required_fields & SAM_PNEXT)
692 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
693
694 if (fd->required_fields & SAM_TLEN)
695 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
696 CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
697
698 if (fd->required_fields & SAM_SEQ)
699 hdr->data_series |= CRAM_SEQ;
700
701 if (!(fd->required_fields & SAM_AUX))
702 // No easy way to get MD/NM without other tags at present
703 fd->decode_md = 0;
704
705 if (fd->required_fields & SAM_QUAL)
706 hdr->data_series |= CRAM_SEQ;
707
708 if (fd->required_fields & SAM_AUX)
709 hdr->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
710
711 if (fd->required_fields & SAM_RGAUX)
712 hdr->data_series |= CRAM_RG | CRAM_BF;
713
714 // Always uncompress CORE block
715 if (cram_uncompress_block(s->block[0]))
716 return -1;
717 } else {
718 hdr->data_series = CRAM_ALL;
719
720 for (i = 0; i < s->hdr->num_blocks; i++) {
721 if (cram_uncompress_block(s->block[i]))
722 return -1;
723 }
724
725 return 0;
726 }
727
728 block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
729 if (!block_used)
730 return -1;
731
732 do {
733 /*
734 * Also set data_series based on code prerequisites. Eg if we need
735 * CRAM_QS then we also need to know CRAM_RL so we know how long it
736 * is, or if we need FC/FP then we also need FN (number of features).
737 *
738 * It's not reciprocal though. We may be needing to decode FN
739 * but have no need to decode FC, FP and cigar ops.
740 */
741 if (hdr->data_series & CRAM_RS) hdr->data_series |= CRAM_FC|CRAM_FP;
742 if (hdr->data_series & CRAM_PD) hdr->data_series |= CRAM_FC|CRAM_FP;
743 if (hdr->data_series & CRAM_HC) hdr->data_series |= CRAM_FC|CRAM_FP;
744 if (hdr->data_series & CRAM_QS) hdr->data_series |= CRAM_FC|CRAM_FP;
745 if (hdr->data_series & CRAM_IN) hdr->data_series |= CRAM_FC|CRAM_FP;
746 if (hdr->data_series & CRAM_SC) hdr->data_series |= CRAM_FC|CRAM_FP;
747 if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_FC|CRAM_FP;
748 if (hdr->data_series & CRAM_DL) hdr->data_series |= CRAM_FC|CRAM_FP;
749 if (hdr->data_series & CRAM_BA) hdr->data_series |= CRAM_FC|CRAM_FP;
750 if (hdr->data_series & CRAM_BB) hdr->data_series |= CRAM_FC|CRAM_FP;
751 if (hdr->data_series & CRAM_QQ) hdr->data_series |= CRAM_FC|CRAM_FP;
752
753 // cram_decode_seq() needs seq[] array
754 if (hdr->data_series & (CRAM_SEQ|CRAM_CIGAR)) hdr->data_series |= CRAM_RL;
755
756 if (hdr->data_series & CRAM_FP) hdr->data_series |= CRAM_FC;
757 if (hdr->data_series & CRAM_FC) hdr->data_series |= CRAM_FN;
758 if (hdr->data_series & CRAM_aux) hdr->data_series |= CRAM_TL;
759 if (hdr->data_series & CRAM_MF) hdr->data_series |= CRAM_CF;
760 if (hdr->data_series & CRAM_MQ) hdr->data_series |= CRAM_BF;
761 if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_RI;
762 if (hdr->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
763 hdr->data_series |= CRAM_CF;
764 if (!hdr->read_names_included && hdr->data_series & CRAM_RN)
765 hdr->data_series |= CRAM_CF | CRAM_NF;
766 if (hdr->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
767 hdr->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
768
769 orig_ds = hdr->data_series;
770
771 // Find which blocks are in use.
772 for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
773 int bnum1, bnum2, j;
774 cram_codec *c = hdr->codecs[i_to_id[i]];
775
776 if (!(hdr->data_series & (1<<i)))
777 continue;
778
779 if (!c)
780 continue;
781
782 bnum1 = cram_codec_to_id(c, &bnum2);
783
784 for (;;) {
785 switch (bnum1) {
786 case -2:
787 break;
788
789 case -1:
790 core_used = 1;
791 break;
792
793 default:
794 for (j = 0; j < s->hdr->num_blocks; j++) {
795 if (s->block[j]->content_type == EXTERNAL &&
796 s->block[j]->content_id == bnum1) {
797 block_used[j] = 1;
798 if (cram_uncompress_block(s->block[j])) {
799 free(block_used);
800 return -1;
801 }
802 }
803 }
804 break;
805 }
806
807 if (bnum2 == -2 || bnum1 == bnum2)
808 break;
809
810 bnum1 = bnum2; // 2nd pass
811 }
812 }
813
814 // Tags too
815 if ((fd->required_fields & SAM_AUX) ||
816 (hdr->data_series & CRAM_aux)) {
817 for (i = 0; i < CRAM_MAP_HASH; i++) {
818 int bnum1, bnum2, j;
819 cram_map *m = hdr->tag_encoding_map[i];
820
821 while (m) {
822 cram_codec *c = m->codec;
823 if (!c)
824 continue;
825
826 bnum1 = cram_codec_to_id(c, &bnum2);
827
828 for (;;) {
829 switch (bnum1) {
830 case -2:
831 break;
832
833 case -1:
834 core_used = 1;
835 break;
836
837 default:
838 for (j = 0; j < s->hdr->num_blocks; j++) {
839 if (s->block[j]->content_type == EXTERNAL &&
840 s->block[j]->content_id == bnum1) {
841 block_used[j] = 1;
842 if (cram_uncompress_block(s->block[j])) {
843 free(block_used);
844 return -1;
845 }
846 }
847 }
848 break;
849 }
850
851 if (bnum2 == -2 || bnum1 == bnum2)
852 break;
853
854 bnum1 = bnum2; // 2nd pass
855 }
856
857 m = m->next;
858 }
859 }
860 }
861
862 // We now know which blocks are in used, so repeat and find
863 // which other data series need to be added.
864 for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
865 int bnum1, bnum2, j;
866 cram_codec *c = hdr->codecs[i_to_id[i]];
867
868 if (!c)
869 continue;
870
871 bnum1 = cram_codec_to_id(c, &bnum2);
872
873 for (;;) {
874 switch (bnum1) {
875 case -2:
876 break;
877
878 case -1:
879 if (core_used) {
880 //printf(" + data series %08x:\n", 1<<i);
881 hdr->data_series |= 1<<i;
882 }
883 break;
884
885 default:
886 for (j = 0; j < s->hdr->num_blocks; j++) {
887 if (s->block[j]->content_type == EXTERNAL &&
888 s->block[j]->content_id == bnum1) {
889 if (block_used[j]) {
890 //printf(" + data series %08x:\n", 1<<i);
891 hdr->data_series |= 1<<i;
892 }
893 }
894 }
895 break;
896 }
897
898 if (bnum2 == -2 || bnum1 == bnum2)
899 break;
900
901 bnum1 = bnum2; // 2nd pass
902 }
903 }
904
905 // Tags too
906 for (i = 0; i < CRAM_MAP_HASH; i++) {
907 int bnum1, bnum2, j;
908 cram_map *m = hdr->tag_encoding_map[i];
909
910 while (m) {
911 cram_codec *c = m->codec;
912 if (!c)
913 continue;
914
915 bnum1 = cram_codec_to_id(c, &bnum2);
916
917 for (;;) {
918 switch (bnum1) {
919 case -2:
920 break;
921
922 case -1:
923 //printf(" + data series %08x:\n", CRAM_aux);
924 hdr->data_series |= CRAM_aux;
925 break;
926
927 default:
928 for (j = 0; j < s->hdr->num_blocks; j++) {
929 if (s->block[j]->content_type &&
930 s->block[j]->content_id == bnum1) {
931 if (block_used[j]) {
932 //printf(" + data series %08x:\n",
933 // CRAM_aux);
934 hdr->data_series |= CRAM_aux;
935 }
936 }
937 }
938 break;
939 }
940
941 if (bnum2 == -2 || bnum1 == bnum2)
942 break;
943
944 bnum1 = bnum2; // 2nd pass
945 }
946
947 m = m->next;
948 }
949 }
950 } while (orig_ds != hdr->data_series);
951
952 free(block_used);
953 return 0;
954 }
955
956 /* ----------------------------------------------------------------------
957 * CRAM slices
958 */
959
960 /*
961 * Decodes a CRAM (un)mapped slice header block.
962 * Returns slice header ptr on success
963 * NULL on failure
964 */
965 cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
966 cram_block_slice_hdr *hdr;
967 char *cp = (char *)b->data;
968 int i;
969
970 if (b->content_type != MAPPED_SLICE &&
971 b->content_type != UNMAPPED_SLICE)
972 return NULL;
973
974 if (!(hdr = calloc(1, sizeof(*hdr))))
975 return NULL;
976
977 hdr->content_type = b->content_type;
978
979 if (b->content_type == MAPPED_SLICE) {
980 cp += itf8_get(cp, &hdr->ref_seq_id);
981 cp += itf8_get(cp, &hdr->ref_seq_start);
982 cp += itf8_get(cp, &hdr->ref_seq_span);
983 }
984 cp += itf8_get(cp, &hdr->num_records);
985 hdr->record_counter = 0;
986 if (CRAM_MAJOR_VERS(fd->version) == 2) {
987 int32_t i32;
988 cp += itf8_get(cp, &i32);
989 hdr->record_counter = i32;
990 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
991 cp += ltf8_get(cp, &hdr->record_counter);
992 }
993
994 cp += itf8_get(cp, &hdr->num_blocks);
995
996 cp += itf8_get(cp, &hdr->num_content_ids);
997 hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
998 if (!hdr->block_content_ids) {
999 free(hdr);
1000 return NULL;
1001 }
1002
1003 for (i = 0; i < hdr->num_content_ids; i++) {
1004 cp += itf8_get(cp, &hdr->block_content_ids[i]);
1005 }
1006
1007 if (b->content_type == MAPPED_SLICE) {
1008 cp += itf8_get(cp, &hdr->ref_base_id);
1009 }
1010
1011 if (CRAM_MAJOR_VERS(fd->version) != 1) {
1012 memcpy(hdr->md5, cp, 16);
1013 } else {
1014 memset(hdr->md5, 0, 16);
1015 }
1016
1017 return hdr;
1018 }
1019
1020
1021 #if 0
1022 /* Returns the number of bits set in val; it the highest bit used */
1023 static int nbits(int v) {
1024 static const int MultiplyDeBruijnBitPosition[32] = {
1025 1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
1026 9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
1027 };
1028
1029 v |= v >> 1; // first up to set all bits 1 after the first 1 */
1030 v |= v >> 2;
1031 v |= v >> 4;
1032 v |= v >> 8;
1033 v |= v >> 16;
1034
1035 // DeBruijn magic to find top bit
1036 return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
1037 }
1038 #endif
1039
1040 #if 0
1041 static int sort_freqs(const void *vp1, const void *vp2) {
1042 const int i1 = *(const int *)vp1;
1043 const int i2 = *(const int *)vp2;
1044 return i1-i2;
1045 }
1046 #endif
1047
1048 /* ----------------------------------------------------------------------
1049 * Primary CRAM sequence decoder
1050 */
1051
1052 /*
1053 * Internal part of cram_decode_slice().
1054 * Generates the sequence, quality and cigar components.
1055 */
1056 static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
1057 cram_block *blk, cram_record *cr, SAM_hdr *bfd,
1058 int cf, char *seq, char *qual) {
1059 int prev_pos = 0, f, r = 0, out_sz = 1;
1060 int seq_pos = 1;
1061 int cig_len = 0, ref_pos = cr->apos;
1062 int32_t fn, i32;
1063 enum cigar_op cig_op = BAM_CMATCH;
1064 uint32_t *cigar = s->cigar;
1065 uint32_t ncigar = s->ncigar;
1066 uint32_t cigar_alloc = s->cigar_alloc;
1067 uint32_t nm = 0, md_dist = 0;
1068 int orig_aux = 0;
1069 int decode_md = fd->decode_md && s->ref;
1070 uint32_t ds = c->comp_hdr->data_series;
1071
1072 if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1073 memset(qual, 30, cr->len);
1074 }
1075
1076 if (decode_md) {
1077 orig_aux = BLOCK_SIZE(s->aux_blk);
1078 BLOCK_APPEND(s->aux_blk, "MDZ", 3);
1079 }
1080
1081 if (ds & CRAM_FN) {
1082 if (!c->comp_hdr->codecs[DS_FN]) return -1;
1083 r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN],
1084 blk, (char *)&fn, &out_sz);
1085 } else {
1086 fn = 0;
1087 }
1088
1089 ref_pos--; // count from 0
1090 cr->cigar = ncigar;
1091
1092 if (!(ds & (CRAM_FC | CRAM_FP)))
1093 goto skip_cigar;
1094
1095 for (f = 0; f < fn; f++) {
1096 int32_t pos = 0;
1097 char op;
1098
1099 if (ncigar+2 >= cigar_alloc) {
1100 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1101 s->cigar = cigar;
1102 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1103 return -1;
1104 }
1105
1106 if (ds & CRAM_FC) {
1107 if (!c->comp_hdr->codecs[DS_FC]) return -1;
1108 r |= c->comp_hdr->codecs[DS_FC]->decode(s,
1109 c->comp_hdr->codecs[DS_FC],
1110 blk,
1111 &op, &out_sz);
1112 }
1113
1114 if (!(ds & CRAM_FP))
1115 continue;
1116
1117 if (!c->comp_hdr->codecs[DS_FP]) return -1;
1118 r |= c->comp_hdr->codecs[DS_FP]->decode(s,
1119 c->comp_hdr->codecs[DS_FP],
1120 blk,
1121 (char *)&pos, &out_sz);
1122 pos += prev_pos;
1123
1124 if (pos > seq_pos) {
1125 if (pos > cr->len+1)
1126 return -1;
1127
1128 if (s->ref && cr->ref_id >= 0) {
1129 if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
1130 static int whinged = 0;
1131 if (!whinged)
1132 fprintf(stderr, "Ref pos outside of ref "
1133 "sequence boundary\n");
1134 whinged = 1;
1135 } else {
1136 memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
1137 pos - seq_pos);
1138 }
1139 }
1140 #ifdef USE_X
1141 if (cig_len && cig_op != BAM_CBASE_MATCH) {
1142 cigar[ncigar++] = (cig_len<<4) + cig_op;
1143 cig_len = 0;
1144 }
1145 cig_op = BAM_CBASE_MATCH;
1146 #else
1147 if (cig_len && cig_op != BAM_CMATCH) {
1148 cigar[ncigar++] = (cig_len<<4) + cig_op;
1149 cig_len = 0;
1150 }
1151 cig_op = BAM_CMATCH;
1152 #endif
1153 cig_len += pos - seq_pos;
1154 ref_pos += pos - seq_pos;
1155 md_dist += pos - seq_pos;
1156 seq_pos = pos;
1157 }
1158
1159 prev_pos = pos;
1160
1161 if (!(ds & CRAM_FC))
1162 goto skip_cigar;
1163
1164 if (!(ds & CRAM_FC))
1165 continue;
1166
1167 switch(op) {
1168 case 'S': { // soft clip: IN
1169 int32_t out_sz2 = 1;
1170
1171 if (cig_len) {
1172 cigar[ncigar++] = (cig_len<<4) + cig_op;
1173 cig_len = 0;
1174 }
1175 if (ds & CRAM_IN) {
1176 switch (CRAM_MAJOR_VERS(fd->version)) {
1177 case 1:
1178 r |= c->comp_hdr->codecs[DS_IN]
1179 ? c->comp_hdr->codecs[DS_IN]
1180 ->decode(s, c->comp_hdr->codecs[DS_IN],
1181 blk, &seq[pos-1], &out_sz2)
1182 : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1183 break;
1184
1185 case 2:
1186 default:
1187 r |= c->comp_hdr->codecs[DS_SC]
1188 ? c->comp_hdr->codecs[DS_SC]
1189 ->decode(s, c->comp_hdr->codecs[DS_SC],
1190 blk, &seq[pos-1], &out_sz2)
1191 : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1192 break;
1193
1194 // default:
1195 // r |= c->comp_hdr->codecs[DS_BB]
1196 // ? c->comp_hdr->codecs[DS_BB]
1197 // ->decode(s, c->comp_hdr->codecs[DS_BB],
1198 // blk, &seq[pos-1], &out_sz2)
1199 // : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1200 }
1201 cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1202 cig_op = BAM_CSOFT_CLIP;
1203 seq_pos += out_sz2;
1204 }
1205 break;
1206 }
1207
1208 case 'X': { // Substitution; BS
1209 unsigned char base;
1210 #ifdef USE_X
1211 if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1212 cigar[ncigar++] = (cig_len<<4) + cig_op;
1213 cig_len = 0;
1214 }
1215 if (ds & CRAM_BS) {
1216 if (!c->comp_hdr->codecs[DS_BS]) return -1;
1217 r |= c->comp_hdr->codecs[DS_BS]
1218 ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
1219 (char *)&base, &out_sz);
1220 seq[pos-1] = 'N'; // FIXME look up BS=base value
1221 }
1222 cig_op = BAM_CBASE_MISMATCH;
1223 #else
1224 int ref_base;
1225 if (cig_len && cig_op != BAM_CMATCH) {
1226 cigar[ncigar++] = (cig_len<<4) + cig_op;
1227 cig_len = 0;
1228 }
1229 if (ds & CRAM_BS) {
1230 if (!c->comp_hdr->codecs[DS_BS]) return -1;
1231 r |= c->comp_hdr->codecs[DS_BS]
1232 ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
1233 (char *)&base, &out_sz);
1234 if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1235 seq[pos-1] = 'N';
1236 } else {
1237 ref_base = fd->L1[(uc)s->ref[ref_pos - s->ref_start +1]];
1238 seq[pos-1] = c->comp_hdr->
1239 substitution_matrix[ref_base][base];
1240 if (decode_md) {
1241 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1242 BLOCK_APPEND_CHAR(s->aux_blk,
1243 s->ref[ref_pos-s->ref_start +1]);
1244 md_dist = 0;
1245 }
1246 }
1247 }
1248 cig_op = BAM_CMATCH;
1249 #endif
1250 nm++;
1251 cig_len++;
1252 seq_pos++;
1253 ref_pos++;
1254 break;
1255 }
1256
1257 case 'D': { // Deletion; DL
1258 if (cig_len && cig_op != BAM_CDEL) {
1259 cigar[ncigar++] = (cig_len<<4) + cig_op;
1260 cig_len = 0;
1261 }
1262 if (ds & CRAM_DL) {
1263 if (!c->comp_hdr->codecs[DS_DL]) return -1;
1264 r |= c->comp_hdr->codecs[DS_DL]
1265 ->decode(s, c->comp_hdr->codecs[DS_DL], blk,
1266 (char *)&i32, &out_sz);
1267 if (decode_md) {
1268 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1269 BLOCK_APPEND_CHAR(s->aux_blk, '^');
1270 BLOCK_APPEND(s->aux_blk,
1271 &s->ref[ref_pos - s->ref_start +1],
1272 i32);
1273 md_dist = 0;
1274 }
1275 cig_op = BAM_CDEL;
1276 cig_len += i32;
1277 ref_pos += i32;
1278 nm += i32;
1279 //printf(" %d: DL = %d (ret %d)\n", f, i32, r);
1280 }
1281 break;
1282 }
1283
1284 case 'I': { // Insertion (several bases); IN
1285 int32_t out_sz2 = 1;
1286
1287 if (cig_len && cig_op != BAM_CINS) {
1288 cigar[ncigar++] = (cig_len<<4) + cig_op;
1289 cig_len = 0;
1290 }
1291
1292 if (ds & CRAM_IN) {
1293 if (!c->comp_hdr->codecs[DS_IN]) return -1;
1294 r |= c->comp_hdr->codecs[DS_IN]
1295 ->decode(s, c->comp_hdr->codecs[DS_IN], blk,
1296 &seq[pos-1], &out_sz2);
1297 cig_op = BAM_CINS;
1298 cig_len += out_sz2;
1299 seq_pos += out_sz2;
1300 nm += out_sz2;
1301 //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
1302 }
1303 break;
1304 }
1305
1306 case 'i': { // Insertion (single base); BA
1307 if (cig_len && cig_op != BAM_CINS) {
1308 cigar[ncigar++] = (cig_len<<4) + cig_op;
1309 cig_len = 0;
1310 }
1311 if (ds & CRAM_BA) {
1312 if (!c->comp_hdr->codecs[DS_BA]) return -1;
1313 r |= c->comp_hdr->codecs[DS_BA]
1314 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
1315 (char *)&seq[pos-1], &out_sz);
1316 }
1317 cig_op = BAM_CINS;
1318 cig_len++;
1319 seq_pos++;
1320 nm++;
1321 break;
1322 }
1323
1324 case 'b': { // Several bases
1325 int32_t len = 1;
1326
1327 if (cig_len && cig_op != BAM_CMATCH) {
1328 cigar[ncigar++] = (cig_len<<4) + cig_op;
1329 cig_len = 0;
1330 }
1331
1332 if (ds & CRAM_BB) {
1333 if (!c->comp_hdr->codecs[DS_BB]) return -1;
1334 r |= c->comp_hdr->codecs[DS_BB]
1335 ->decode(s, c->comp_hdr->codecs[DS_BB], blk,
1336 (char *)&seq[pos-1], &len);
1337 }
1338
1339 cig_op = BAM_CMATCH;
1340
1341 cig_len+=len;
1342 seq_pos+=len;
1343 ref_pos+=len;
1344 //prev_pos+=len;
1345 break;
1346 }
1347
1348 case 'q': { // Several quality values
1349 int32_t len = 1;
1350
1351 if (cig_len && cig_op != BAM_CMATCH) {
1352 cigar[ncigar++] = (cig_len<<4) + cig_op;
1353 cig_len = 0;
1354 }
1355
1356 if (ds & CRAM_QQ) {
1357 if (!c->comp_hdr->codecs[DS_QQ]) return -1;
1358 r |= c->comp_hdr->codecs[DS_QQ]
1359 ->decode(s, c->comp_hdr->codecs[DS_QQ], blk,
1360 (char *)&qual[pos-1], &len);
1361 }
1362
1363 cig_op = BAM_CMATCH;
1364
1365 cig_len+=len;
1366 seq_pos+=len;
1367 ref_pos+=len;
1368 //prev_pos+=len;
1369 break;
1370 }
1371
1372 case 'B': { // Read base; BA, QS
1373 #ifdef USE_X
1374 if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1375 cigar[ncigar++] = (cig_len<<4) + cig_op;
1376 cig_len = 0;
1377 }
1378 #else
1379 if (cig_len && cig_op != BAM_CMATCH) {
1380 cigar[ncigar++] = (cig_len<<4) + cig_op;
1381 cig_len = 0;
1382 }
1383 #endif
1384 if (ds & CRAM_BA) {
1385 if (!c->comp_hdr->codecs[DS_BA]) return -1;
1386 r |= c->comp_hdr->codecs[DS_BA]
1387 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
1388 (char *)&seq[pos-1], &out_sz);
1389 }
1390 if (ds & CRAM_QS) {
1391 if (!c->comp_hdr->codecs[DS_QS]) return -1;
1392 r |= c->comp_hdr->codecs[DS_QS]
1393 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1394 (char *)&qual[pos-1], &out_sz);
1395 }
1396 #ifdef USE_X
1397 cig_op = BAM_CBASE_MISMATCH;
1398 #else
1399 cig_op = BAM_CMATCH;
1400 #endif
1401 cig_len++;
1402 seq_pos++;
1403 ref_pos++;
1404 //printf(" %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
1405 break;
1406 }
1407
1408 case 'Q': { // Quality score; QS
1409 if (ds & CRAM_QS) {
1410 if (!c->comp_hdr->codecs[DS_QS]) return -1;
1411 r |= c->comp_hdr->codecs[DS_QS]
1412 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1413 (char *)&qual[pos-1], &out_sz);
1414 //printf(" %d: QS = %d (ret %d)\n", f, qc, r);
1415 }
1416 break;
1417 }
1418
1419 case 'H': { // hard clip; HC
1420 if (cig_len && cig_op != BAM_CHARD_CLIP) {
1421 cigar[ncigar++] = (cig_len<<4) + cig_op;
1422 cig_len = 0;
1423 }
1424 if (ds & CRAM_HC) {
1425 if (!c->comp_hdr->codecs[DS_HC]) return -1;
1426 r |= c->comp_hdr->codecs[DS_HC]
1427 ->decode(s, c->comp_hdr->codecs[DS_HC], blk,
1428 (char *)&i32, &out_sz);
1429 cig_op = BAM_CHARD_CLIP;
1430 cig_len += i32;
1431 nm += i32;
1432 }
1433 break;
1434 }
1435
1436 case 'P': { // padding; PD
1437 if (cig_len && cig_op != BAM_CPAD) {
1438 cigar[ncigar++] = (cig_len<<4) + cig_op;
1439 cig_len = 0;
1440 }
1441 if (ds & CRAM_PD) {
1442 if (!c->comp_hdr->codecs[DS_PD]) return -1;
1443 r |= c->comp_hdr->codecs[DS_PD]
1444 ->decode(s, c->comp_hdr->codecs[DS_PD], blk,
1445 (char *)&i32, &out_sz);
1446 cig_op = BAM_CPAD;
1447 cig_len += i32;
1448 nm += i32;
1449 }
1450 break;
1451 }
1452
1453 case 'N': { // Ref skip; RS
1454 if (cig_len && cig_op != BAM_CREF_SKIP) {
1455 cigar[ncigar++] = (cig_len<<4) + cig_op;
1456 cig_len = 0;
1457 }
1458 if (ds & CRAM_RS) {
1459 if (!c->comp_hdr->codecs[DS_RS]) return -1;
1460 r |= c->comp_hdr->codecs[DS_RS]
1461 ->decode(s, c->comp_hdr->codecs[DS_RS], blk,
1462 (char *)&i32, &out_sz);
1463 cig_op = BAM_CREF_SKIP;
1464 cig_len += i32;
1465 ref_pos += i32;
1466 nm += i32;
1467 }
1468 break;
1469 }
1470
1471 default:
1472 abort();
1473 }
1474 }
1475
1476 if (!(ds & CRAM_FC))
1477 goto skip_cigar;
1478
1479 /* An implement match op for any unaccounted for bases */
1480 if ((ds & CRAM_FN) && cr->len >= seq_pos) {
1481 if (s->ref) {
1482 if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
1483 static int whinged = 0;
1484 if (!whinged)
1485 fprintf(stderr, "Ref pos outside of ref sequence boundary\n");
1486 whinged = 1;
1487 } else {
1488 memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
1489 cr->len - seq_pos + 1);
1490 ref_pos += cr->len - seq_pos + 1;
1491 md_dist += cr->len - seq_pos + 1;
1492 }
1493 }
1494
1495 if (ncigar+1 >= cigar_alloc) {
1496 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1497 s->cigar = cigar;
1498 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1499 return -1;
1500 }
1501 #ifdef USE_X
1502 if (cig_len && cig_op != BAM_CBASE_MATCH) {
1503 cigar[ncigar++] = (cig_len<<4) + cig_op;
1504 cig_len = 0;
1505 }
1506 cig_op = BAM_CBASE_MATCH;
1507 #else
1508 if (cig_len && cig_op != BAM_CMATCH) {
1509 cigar[ncigar++] = (cig_len<<4) + cig_op;
1510 cig_len = 0;
1511 }
1512 cig_op = BAM_CMATCH;
1513 #endif
1514 cig_len += cr->len - seq_pos+1;
1515 }
1516
1517 skip_cigar:
1518
1519 if ((ds & CRAM_FN) && decode_md) {
1520 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1521 }
1522
1523 if (cig_len) {
1524 if (ncigar >= cigar_alloc) {
1525 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1526 s->cigar = cigar;
1527 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1528 return -1;
1529 }
1530
1531 cigar[ncigar++] = (cig_len<<4) + cig_op;
1532 }
1533
1534 cr->ncigar = ncigar - cr->cigar;
1535 cr->aend = ref_pos;
1536
1537 //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
1538
1539 if (ds & CRAM_MQ) {
1540 if (!c->comp_hdr->codecs[DS_MQ]) return -1;
1541 r |= c->comp_hdr->codecs[DS_MQ]
1542 ->decode(s, c->comp_hdr->codecs[DS_MQ], blk,
1543 (char *)&cr->mqual, &out_sz);
1544 } else {
1545 cr->mqual = 40;
1546 }
1547
1548 if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1549 int32_t out_sz2 = cr->len;
1550
1551 if (ds & CRAM_QS) {
1552 if (!c->comp_hdr->codecs[DS_QS]) return -1;
1553 r |= c->comp_hdr->codecs[DS_QS]
1554 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1555 qual, &out_sz2);
1556 }
1557 }
1558
1559 s->cigar = cigar;
1560 s->cigar_alloc = cigar_alloc;
1561 s->ncigar = ncigar;
1562
1563 if (decode_md) {
1564 char buf[7];
1565 BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1566 cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux;
1567 buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I';
1568 buf[3] = (nm>> 0) & 0xff;
1569 buf[4] = (nm>> 8) & 0xff;
1570 buf[5] = (nm>>16) & 0xff;
1571 buf[6] = (nm>>24) & 0xff;
1572 BLOCK_APPEND(s->aux_blk, buf, 7);
1573 cr->aux_size += 7;
1574 }
1575
1576 return r;
1577 }
1578
1579 /*
1580 * Quick and simple hash lookup for cram_map arrays
1581 */
1582 static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1583 cram_map *m;
1584
1585 m = map[CRAM_MAP(key[0],key[1])];
1586 while (m && m->key != id)
1587 m= m->next;
1588
1589 return m;
1590 }
1591
1592 //#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1593
1594
1595 static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1596 cram_block *blk, cram_record *cr) {
1597 int i, r = 0, out_sz = 1;
1598 unsigned char ntags;
1599
1600 if (!c->comp_hdr->codecs[DS_TC]) return -1;
1601 r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
1602 (char *)&ntags, &out_sz);
1603 cr->ntags = ntags;
1604
1605 //printf("TC=%d\n", cr->ntags);
1606 cr->aux_size = 0;
1607 cr->aux = BLOCK_SIZE(s->aux_blk);
1608
1609 for (i = 0; i < cr->ntags; i++) {
1610 int32_t id, out_sz = 1;
1611 unsigned char tag_data[3];
1612 cram_map *m;
1613
1614 //printf("Tag %d/%d\n", i+1, cr->ntags);
1615 if (!c->comp_hdr->codecs[DS_TN]) return -1;
1616 r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
1617 blk, (char *)&id, &out_sz);
1618 if (out_sz == 3) {
1619 tag_data[0] = ((char *)&id)[0];
1620 tag_data[1] = ((char *)&id)[1];
1621 tag_data[2] = ((char *)&id)[2];
1622 } else {
1623 tag_data[0] = (id>>16) & 0xff;
1624 tag_data[1] = (id>>8) & 0xff;
1625 tag_data[2] = id & 0xff;
1626 }
1627
1628 m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1629 if (!m)
1630 return -1;
1631 BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1632
1633 if (!m->codec) return -1;
1634 r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1635
1636 cr->aux_size += out_sz + 3;
1637 }
1638
1639 return r;
1640 }
1641
1642 static int cram_decode_aux(cram_container *c, cram_slice *s,
1643 cram_block *blk, cram_record *cr) {
1644 int i, r = 0, out_sz = 1;
1645 int32_t TL = 0;
1646 unsigned char *TN;
1647 uint32_t ds = c->comp_hdr->data_series;
1648
1649 if (!(ds & (CRAM_TL|CRAM_aux))) {
1650 cr->aux = 0;
1651 cr->aux_size = 0;
1652 return 0;
1653 }
1654
1655 if (!c->comp_hdr->codecs[DS_TL]) return -1;
1656 r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
1657 (char *)&TL, &out_sz);
1658 if (r || TL < 0 || TL >= c->comp_hdr->nTL)
1659 return -1;
1660
1661 TN = c->comp_hdr->TL[TL];
1662 cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
1663
1664 //printf("TC=%d\n", cr->ntags);
1665 cr->aux_size = 0;
1666 cr->aux = BLOCK_SIZE(s->aux_blk);
1667
1668 if (!(ds & CRAM_aux))
1669 return 0;
1670
1671 for (i = 0; i < cr->ntags; i++) {
1672 int32_t id, out_sz = 1;
1673 unsigned char tag_data[3];
1674 cram_map *m;
1675
1676 //printf("Tag %d/%d\n", i+1, cr->ntags);
1677 tag_data[0] = *TN++;
1678 tag_data[1] = *TN++;
1679 tag_data[2] = *TN++;
1680 id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
1681
1682 m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1683 if (!m)
1684 return -1;
1685 BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1686
1687 if (!m->codec) return -1;
1688 r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1689 cr->aux_size += out_sz + 3;
1690 }
1691
1692 return r;
1693 }
1694
1695 /* Resolve mate pair cross-references between recs within this slice */
1696 static void cram_decode_slice_xref(cram_slice *s, int required_fields) {
1697 int rec;
1698
1699 if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
1700 for (rec = 0; rec < s->hdr->num_records; rec++) {
1701 cram_record *cr = &s->crecs[rec];
1702
1703 cr->tlen = 0;
1704 cr->mate_pos = 0;
1705 cr->mate_ref_id = -1;
1706 }
1707
1708 return;
1709 }
1710
1711 for (rec = 0; rec < s->hdr->num_records; rec++) {
1712 cram_record *cr = &s->crecs[rec];
1713
1714 if (cr->mate_line >= 0) {
1715 if (cr->mate_line < s->hdr->num_records) {
1716 /*
1717 * On the first read, loop through computing lengths.
1718 * It's not perfect as we have one slice per reference so we
1719 * cannot detect when TLEN should be zero due to seqs that
1720 * map to multiple references.
1721 *
1722 * We also cannot set tlen correct when it spans a slice for
1723 * other reasons. This may make tlen too small. Should we
1724 * fix this by forcing TLEN to be stored verbatim in such cases?
1725 *
1726 * Or do we just admit defeat and output 0 for tlen? It's the
1727 * safe option...
1728 */
1729 if (cr->tlen == INT_MIN) {
1730 int id1 = rec, id2 = rec;
1731 int aleft = cr->apos, aright = cr->aend;
1732 int tlen;
1733 int ref = cr->ref_id;
1734
1735 // number of segments starting at the same point.
1736 int left_cnt = 0;
1737
1738 do {
1739 if (aleft > s->crecs[id2].apos)
1740 aleft = s->crecs[id2].apos, left_cnt = 1;
1741 else if (aleft == s->crecs[id2].apos)
1742 left_cnt++;
1743 if (aright < s->crecs[id2].aend)
1744 aright = s->crecs[id2].aend;
1745 if (s->crecs[id2].mate_line == -1) {
1746 s->crecs[id2].mate_line = rec;
1747 break;
1748 }
1749 assert(s->crecs[id2].mate_line > id2);
1750 id2 = s->crecs[id2].mate_line;
1751
1752 if (s->crecs[id2].ref_id != ref)
1753 ref = -1;
1754 } while (id2 != id1);
1755
1756 if (ref != -1) {
1757 tlen = aright - aleft + 1;
1758 id1 = id2 = rec;
1759
1760 /*
1761 * When we have two seqs with identical start and
1762 * end coordinates, set +/- tlen based on 1st/last
1763 * bit flags instead, as a tie breaker.
1764 */
1765 if (s->crecs[id2].apos == aleft) {
1766 if (left_cnt == 1 ||
1767 (s->crecs[id2].flags & BAM_FREAD1))
1768 s->crecs[id2].tlen = tlen;
1769 else
1770 s->crecs[id2].tlen = -tlen;
1771 } else {
1772 s->crecs[id2].tlen = -tlen;
1773 }
1774
1775 id2 = s->crecs[id2].mate_line;
1776 while (id2 != id1) {
1777 if (s->crecs[id2].apos == aleft) {
1778 if (left_cnt == 1 ||
1779 (s->crecs[id2].flags & BAM_FREAD1))
1780 s->crecs[id2].tlen = tlen;
1781 else
1782 s->crecs[id2].tlen = -tlen;
1783 } else {
1784 s->crecs[id2].tlen = -tlen;
1785 }
1786 id2 = s->crecs[id2].mate_line;
1787 }
1788 } else {
1789 id1 = id2 = rec;
1790
1791 s->crecs[id2].tlen = 0;
1792 id2 = s->crecs[id2].mate_line;
1793 while (id2 != id1) {
1794 s->crecs[id2].tlen = 0;
1795 id2 = s->crecs[id2].mate_line;
1796 }
1797 }
1798 }
1799
1800 cr->mate_pos = s->crecs[cr->mate_line].apos;
1801 cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
1802
1803 // paired
1804 cr->flags |= BAM_FPAIRED;
1805
1806 // set mate unmapped if needed
1807 if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
1808 cr->flags |= BAM_FMUNMAP;
1809 cr->tlen = 0;
1810 }
1811 if (cr->flags & BAM_FUNMAP) {
1812 cr->tlen = 0;
1813 }
1814
1815 // set mate reversed if needed
1816 if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
1817 cr->flags |= BAM_FMREVERSE;
1818 } else {
1819 fprintf(stderr, "Mate line out of bounds: %d vs [0, %d]\n",
1820 cr->mate_line, s->hdr->num_records-1);
1821 }
1822
1823 /* FIXME: construct read names here too if needed */
1824 } else {
1825 if (cr->mate_flags & CRAM_M_REVERSE) {
1826 cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
1827 }
1828 if (cr->mate_flags & CRAM_M_UNMAP) {
1829 cr->flags |= BAM_FMUNMAP;
1830 //cr->mate_ref_id = -1;
1831 }
1832 if (!(cr->flags & BAM_FPAIRED))
1833 cr->mate_ref_id = -1;
1834 }
1835
1836 if (cr->tlen == INT_MIN)
1837 cr->tlen = 0; // Just incase
1838 }
1839 }
1840
1841 static char *md5_print(unsigned char *md5, char *out) {
1842 int i;
1843 for (i = 0; i < 16; i++) {
1844 out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
1845 out[i*2+1] = "0123456789abcdef"[md5[i]&15];
1846 }
1847 out[32] = 0;
1848
1849 return out;
1850 }
1851
1852 /*
1853 * Decode an entire slice from container blocks. Fills out s->crecs[] array.
1854 * Returns 0 on success
1855 * -1 on failure
1856 */
1857 int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
1858 SAM_hdr *bfd) {
1859 cram_block *blk = s->block[0];
1860 int32_t bf, ref_id;
1861 unsigned char cf;
1862 int out_sz, r = 0;
1863 int rec;
1864 char *seq = NULL, *qual = NULL;
1865 int unknown_rg = -1;
1866 int embed_ref;
1867 char **refs = NULL;
1868 uint32_t ds;
1869
1870 if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
1871 return -1;
1872
1873 ds = c->comp_hdr->data_series;
1874
1875 blk->bit = 7; // MSB first
1876
1877 /* Look for unknown RG, added as last by Java CRAM? */
1878 if (bfd->nrg > 0 &&
1879 !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
1880 unknown_rg = bfd->nrg-1;
1881
1882 if (blk->content_type != CORE)
1883 return -1;
1884
1885 if (s->crecs)
1886 free(s->crecs);
1887 if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
1888 return -1;
1889
1890 ref_id = s->hdr->ref_seq_id;
1891 embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
1892
1893 if (ref_id >= 0) {
1894 if (embed_ref) {
1895 cram_block *b;
1896 if (s->hdr->ref_base_id < 0) {
1897 fprintf(stderr, "No reference specified and "
1898 "no embedded reference is available.\n");
1899 return -1;
1900 }
1901 if (!s->block_by_id ||
1902 !(b = s->block_by_id[s->hdr->ref_base_id]))
1903 return -1;
1904 cram_uncompress_block(b);
1905 s->ref = (char *)BLOCK_DATA(b);
1906 s->ref_start = s->hdr->ref_seq_start;
1907 s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
1908 } else if (!fd->no_ref) {
1909 //// Avoid Java cramtools bug by loading entire reference seq
1910 //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
1911 //s->ref_start = 1;
1912
1913 if (fd->required_fields & SAM_SEQ)
1914 s->ref =
1915 cram_get_ref(fd, s->hdr->ref_seq_id,
1916 s->hdr->ref_seq_start,
1917 s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
1918 s->ref_start = s->hdr->ref_seq_start;
1919 s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
1920
1921 /* Sanity check */
1922 if (s->ref_start < 0) {
1923 fprintf(stderr, "Slice starts before base 1.\n");
1924 s->ref_start = 0;
1925 }
1926 pthread_mutex_lock(&fd->ref_lock);
1927 pthread_mutex_lock(&fd->refs->lock);
1928 if ((fd->required_fields & SAM_SEQ) &&
1929 s->ref_end > fd->refs->ref_id[ref_id]->length) {
1930 fprintf(stderr, "Slice ends beyond reference end.\n");
1931 s->ref_end = fd->refs->ref_id[ref_id]->length;
1932 }
1933 pthread_mutex_unlock(&fd->refs->lock);
1934 pthread_mutex_unlock(&fd->ref_lock);
1935 }
1936 }
1937
1938 if ((fd->required_fields & SAM_SEQ) &&
1939 s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) {
1940 fprintf(stderr, "Unable to fetch reference #%d %d..%d\n",
1941 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
1942 s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
1943 return -1;
1944 }
1945
1946 if (CRAM_MAJOR_VERS(fd->version) != 1
1947 && (fd->required_fields & SAM_SEQ)
1948 && s->hdr->ref_seq_id >= 0
1949 && !fd->ignore_md5
1950 && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
1951 MD5_CTX md5;
1952 unsigned char digest[16];
1953
1954 if (s->ref && s->hdr->ref_seq_id >= 0) {
1955 int start, len;
1956
1957 if (s->hdr->ref_seq_start >= s->ref_start) {
1958 start = s->hdr->ref_seq_start - s->ref_start;
1959 } else {
1960 fprintf(stderr, "Slice starts before base 1.\n");
1961 start = 0;
1962 }
1963
1964 if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
1965 len = s->hdr->ref_seq_span;
1966 } else {
1967 fprintf(stderr, "Slice ends beyond reference end.\n");
1968 len = s->ref_end - s->ref_start + 1;
1969 }
1970
1971 MD5_Init(&md5);
1972 if (start + len > s->ref_end - s->ref_start + 1)
1973 len = s->ref_end - s->ref_start + 1 - start;
1974 if (len >= 0)
1975 MD5_Update(&md5, s->ref + start, len);
1976 MD5_Final(digest, &md5);
1977 } else if (!s->ref && s->hdr->ref_base_id >= 0) {
1978 cram_block *b;
1979 if (s->block_by_id && (b = s->block_by_id[s->hdr->ref_base_id])) {
1980 MD5_Init(&md5);
1981 MD5_Update(&md5, b->data, b->uncomp_size);
1982 MD5_Final(digest, &md5);
1983 }
1984 }
1985
1986 if ((!s->ref && s->hdr->ref_base_id < 0)
1987 || memcmp(digest, s->hdr->md5, 16) != 0) {
1988 char M[33];
1989 fprintf(stderr, "ERROR: md5sum reference mismatch for ref "
1990 "%d pos %d..%d\n", ref_id, s->ref_start, s->ref_end);
1991 fprintf(stderr, "CRAM: %s\n", md5_print(s->hdr->md5, M));
1992 fprintf(stderr, "Ref : %s\n", md5_print(digest, M));
1993 return -1;
1994 }
1995 }
1996
1997 if (ref_id == -2) {
1998 pthread_mutex_lock(&fd->ref_lock);
1999 pthread_mutex_lock(&fd->refs->lock);
2000 refs = calloc(fd->refs->nref, sizeof(char *));
2001 pthread_mutex_unlock(&fd->refs->lock);
2002 pthread_mutex_unlock(&fd->ref_lock);
2003 if (!refs)
2004 return -1;
2005 }
2006
2007 for (rec = 0; rec < s->hdr->num_records; rec++) {
2008 cram_record *cr = &s->crecs[rec];
2009
2010 //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
2011
2012 cr->s = s;
2013
2014 out_sz = 1; /* decode 1 item */
2015 if (ds & CRAM_BF) {
2016 if (!c->comp_hdr->codecs[DS_BF]) return -1;
2017 r |= c->comp_hdr->codecs[DS_BF]
2018 ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
2019 (char *)&bf, &out_sz);
2020 if (bf < 0 ||
2021 bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
2022 return -1;
2023 bf = fd->bam_flag_swap[bf];
2024 cr->flags = bf;
2025 } else {
2026 cr->flags = bf = 0x4; // unmapped
2027 }
2028
2029 if (ds & CRAM_CF) {
2030 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2031 /* CF is byte in 1.0, int32 in 2.0 */
2032 if (!c->comp_hdr->codecs[DS_CF]) return -1;
2033 r |= c->comp_hdr->codecs[DS_CF]
2034 ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2035 (char *)&cf, &out_sz);
2036 cr->cram_flags = cf;
2037 } else {
2038 if (!c->comp_hdr->codecs[DS_CF]) return -1;
2039 r |= c->comp_hdr->codecs[DS_CF]
2040 ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2041 (char *)&cr->cram_flags,
2042 &out_sz);
2043 cf = cr->cram_flags;
2044 }
2045 }
2046
2047 if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
2048 if (ds & CRAM_RI) {
2049 if (!c->comp_hdr->codecs[DS_RI]) return -1;
2050 r |= c->comp_hdr->codecs[DS_RI]
2051 ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
2052 (char *)&cr->ref_id, &out_sz);
2053 if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
2054 && cr->ref_id >= 0) {
2055 if (!fd->no_ref) {
2056 if (!refs[cr->ref_id])
2057 refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id,
2058 1, 0);
2059 s->ref = refs[cr->ref_id];
2060 }
2061 s->ref_start = 1;
2062 pthread_mutex_lock(&fd->ref_lock);
2063 pthread_mutex_lock(&fd->refs->lock);
2064 s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
2065 pthread_mutex_unlock(&fd->refs->lock);
2066 pthread_mutex_unlock(&fd->ref_lock);
2067 }
2068 } else {
2069 cr->ref_id = 0;
2070 }
2071 } else {
2072 cr->ref_id = ref_id; // Forced constant in CRAM 1.0
2073 }
2074
2075
2076 if (ds & CRAM_RL) {
2077 if (!c->comp_hdr->codecs[DS_RL]) return -1;
2078 r |= c->comp_hdr->codecs[DS_RL]
2079 ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
2080 (char *)&cr->len, &out_sz);
2081 }
2082
2083 if (ds & CRAM_AP) {
2084 if (!c->comp_hdr->codecs[DS_AP]) return -1;
2085 r |= c->comp_hdr->codecs[DS_AP]
2086 ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2087 (char *)&cr->apos, &out_sz);
2088 if (c->comp_hdr->AP_delta)
2089 cr->apos += s->last_apos;
2090 s->last_apos= cr->apos;
2091 } else {
2092 cr->apos = c->ref_seq_start;
2093 }
2094
2095 if (ds & CRAM_RG) {
2096 if (!c->comp_hdr->codecs[DS_RG]) return -1;
2097 r |= c->comp_hdr->codecs[DS_RG]
2098 ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
2099 (char *)&cr->rg, &out_sz);
2100 if (cr->rg == unknown_rg)
2101 cr->rg = -1;
2102 } else {
2103 cr->rg = -1;
2104 }
2105
2106 cr->name_len = 0;
2107
2108 if (c->comp_hdr->read_names_included) {
2109 int32_t out_sz2 = 1;
2110
2111 // Read directly into name cram_block
2112 cr->name = BLOCK_SIZE(s->name_blk);
2113 if (ds & CRAM_RN) {
2114 if (!c->comp_hdr->codecs[DS_RN]) return -1;
2115 r |= c->comp_hdr->codecs[DS_RN]
2116 ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
2117 (char *)s->name_blk, &out_sz2);
2118 cr->name_len = out_sz2;
2119 }
2120 }
2121
2122 cr->mate_pos = 0;
2123 cr->mate_line = -1;
2124 cr->mate_ref_id = -1;
2125 if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
2126 if (ds & CRAM_MF) {
2127 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2128 /* MF is byte in 1.0, int32 in 2.0 */
2129 unsigned char mf;
2130 if (!c->comp_hdr->codecs[DS_MF]) return -1;
2131 r |= c->comp_hdr->codecs[DS_MF]
2132 ->decode(s, c->comp_hdr->codecs[DS_MF],
2133 blk, (char *)&mf, &out_sz);
2134 cr->mate_flags = mf;
2135 } else {
2136 if (!c->comp_hdr->codecs[DS_MF]) return -1;
2137 r |= c->comp_hdr->codecs[DS_MF]
2138 ->decode(s, c->comp_hdr->codecs[DS_MF],
2139 blk,
2140 (char *)&cr->mate_flags,
2141 &out_sz);
2142 }
2143 } else {
2144 cr->mate_flags = 0;
2145 }
2146
2147 if (!c->comp_hdr->read_names_included) {
2148 int32_t out_sz2 = 1;
2149
2150 // Read directly into name cram_block
2151 cr->name = BLOCK_SIZE(s->name_blk);
2152 if (ds & CRAM_RN) {
2153 if (!c->comp_hdr->codecs[DS_RN]) return -1;
2154 r |= c->comp_hdr->codecs[DS_RN]
2155 ->decode(s, c->comp_hdr->codecs[DS_RN],
2156 blk, (char *)s->name_blk,
2157 &out_sz2);
2158 cr->name_len = out_sz2;
2159 }
2160 }
2161
2162 if (ds & CRAM_NS) {
2163 if (!c->comp_hdr->codecs[DS_NS]) return -1;
2164 r |= c->comp_hdr->codecs[DS_NS]
2165 ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
2166 (char *)&cr->mate_ref_id, &out_sz);
2167 }
2168
2169 // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
2170 // if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
2171 // /* Paired, but unmapped */
2172 // cr->flags |= BAM_FMUNMAP;
2173 // }
2174
2175 if (ds & CRAM_NP) {
2176 if (!c->comp_hdr->codecs[DS_NP]) return -1;
2177 r |= c->comp_hdr->codecs[DS_NP]
2178 ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2179 (char *)&cr->mate_pos, &out_sz);
2180 }
2181
2182 if (ds & CRAM_TS) {
2183 if (!c->comp_hdr->codecs[DS_TS]) return -1;
2184 r |= c->comp_hdr->codecs[DS_TS]
2185 ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2186 (char *)&cr->tlen, &out_sz);
2187 } else {
2188 cr->tlen = INT_MIN;
2189 }
2190 } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
2191 if (ds & CRAM_NF) {
2192 if (!c->comp_hdr->codecs[DS_NF]) return -1;
2193 r |= c->comp_hdr->codecs[DS_NF]
2194 ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
2195 (char *)&cr->mate_line, &out_sz);
2196 cr->mate_line += rec + 1;
2197
2198 //cr->name_len = sprintf(name, "%d", name_id++);
2199 //cr->name = DSTRING_LEN(name_ds);
2200 //dstring_nappend(name_ds, name, cr->name_len);
2201
2202 cr->mate_ref_id = -1;
2203 cr->tlen = INT_MIN;
2204 cr->mate_pos = 0;
2205 } else {
2206 cr->mate_flags = 0;
2207 cr->tlen = INT_MIN;
2208 }
2209 } else {
2210 cr->mate_flags = 0;
2211 cr->tlen = INT_MIN;
2212 }
2213 /*
2214 else if (!name[0]) {
2215 //name[0] = '?'; name[1] = 0;
2216 //cr->name_len = 1;
2217 //cr->name= DSTRING_LEN(s->name_ds);
2218 //dstring_nappend(s->name_ds, "?", 1);
2219
2220 cr->mate_ref_id = -1;
2221 cr->tlen = 0;
2222 cr->mate_pos = 0;
2223 }
2224 */
2225
2226 /* Auxiliary tags */
2227 if (CRAM_MAJOR_VERS(fd->version) == 1)
2228 r |= cram_decode_aux_1_0(c, s, blk, cr);
2229 else
2230 r |= cram_decode_aux(c, s, blk, cr);
2231
2232 /* Fake up dynamic string growth and appending */
2233 if (ds & CRAM_RL) {
2234 cr->seq = BLOCK_SIZE(s->seqs_blk);
2235 BLOCK_GROW(s->seqs_blk, cr->len);
2236 seq = (char *)BLOCK_END(s->seqs_blk);
2237 BLOCK_SIZE(s->seqs_blk) += cr->len;
2238
2239 if (!seq)
2240 return -1;
2241
2242 cr->qual = BLOCK_SIZE(s->qual_blk);
2243 BLOCK_GROW(s->qual_blk, cr->len);
2244 qual = (char *)BLOCK_END(s->qual_blk);
2245 BLOCK_SIZE(s->qual_blk) += cr->len;
2246
2247 if (!s->ref)
2248 memset(seq, '=', cr->len);
2249 }
2250
2251 if (!(bf & BAM_FUNMAP)) {
2252 /* Decode sequence and generate CIGAR */
2253 if (ds & (CRAM_SEQ | CRAM_MQ)) {
2254 r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual);
2255 } else {
2256 cr->cigar = 0;
2257 cr->ncigar = 0;
2258 cr->aend = cr->apos;
2259 cr->mqual = 0;
2260 }
2261 } else {
2262 int out_sz2 = cr->len;
2263
2264 //puts("Unmapped");
2265 cr->cigar = 0;
2266 cr->ncigar = 0;
2267 cr->aend = cr->apos;
2268 cr->mqual = 0;
2269
2270 if (ds & CRAM_BA) {
2271 if (!c->comp_hdr->codecs[DS_BA]) return -1;
2272 r |= c->comp_hdr->codecs[DS_BA]
2273 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
2274 (char *)seq, &out_sz2);
2275 }
2276
2277 if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2278 out_sz2 = cr->len;
2279 if (ds & CRAM_QS) {
2280 if (!c->comp_hdr->codecs[DS_QS]) return -1;
2281 r |= c->comp_hdr->codecs[DS_QS]
2282 ->decode(s, c->comp_hdr->codecs[DS_QS],
2283 blk, qual, &out_sz2);
2284 }
2285 } else {
2286 if (ds & CRAM_RL)
2287 memset(qual, 30, cr->len);
2288 }
2289 }
2290 }
2291
2292 pthread_mutex_lock(&fd->ref_lock);
2293 if (refs) {
2294 int i;
2295 for (i = 0; i < fd->refs->nref; i++) {
2296 if (refs[i])
2297 cram_ref_decr(fd->refs, i);
2298 }
2299 free(refs);
2300 } else if (ref_id >= 0 && s->ref != fd->ref_free) {
2301 cram_ref_decr(fd->refs, ref_id);
2302 }
2303 pthread_mutex_unlock(&fd->ref_lock);
2304
2305 /* Resolve mate pair cross-references between recs within this slice */
2306 cram_decode_slice_xref(s, fd->required_fields);
2307
2308 return r;
2309 }
2310
2311 typedef struct {
2312 cram_fd *fd;
2313 cram_container *c;
2314 cram_slice *s;
2315 SAM_hdr *h;
2316 int exit_code;
2317 } cram_decode_job;
2318
2319 void *cram_decode_slice_thread(void *arg) {
2320 cram_decode_job *j = (cram_decode_job *)arg;
2321
2322 j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
2323
2324 return j;
2325 }
2326
2327 /*
2328 * Spawn a multi-threaded version of cram_decode_slice().
2329 */
2330 int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
2331 SAM_hdr *bfd) {
2332 cram_decode_job *j;
2333 int nonblock;
2334
2335 if (!fd->pool)
2336 return cram_decode_slice(fd, c, s, bfd);
2337
2338 if (!(j = malloc(sizeof(*j))))
2339 return -1;
2340
2341 j->fd = fd;
2342 j->c = c;
2343 j->s = s;
2344 j->h = bfd;
2345
2346 nonblock = t_pool_results_queue_sz(fd->rqueue) ? 1 : 0;
2347
2348 if (-1 == t_pool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
2349 j, nonblock)) {
2350 /* Would block */
2351 fd->job_pending = j;
2352 } else {
2353 fd->job_pending = NULL;
2354 }
2355
2356 // flush too
2357 return 0;
2358 }
2359
2360
2361 /* ----------------------------------------------------------------------
2362 * CRAM sequence iterators.
2363 */
2364
2365 /*
2366 * Converts a cram in-memory record into a bam in-memory record. We
2367 * pass a pointer to a bam_seq_t pointer along with the a pointer to
2368 * the allocated size. These can initially be pointers to NULL and zero.
2369 *
2370 * This function will reallocate the bam buffer as required and update
2371 * (*bam)->alloc accordingly, allowing it to be used within a loop
2372 * efficiently without needing to allocate new bam objects over and
2373 * over again.
2374 *
2375 * Returns the used size of the bam record on success
2376 * -1 on failure.
2377 */
2378 static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
2379 cram_record *cr, int rec, bam_seq_t **bam) {
2380 int bam_idx, rg_len;
2381 char name_a[1024], *name;
2382 int name_len;
2383 char *aux, *aux_orig;
2384 char *seq, *qual;
2385
2386 /* Assign names if not explicitly set */
2387 if (fd->required_fields & SAM_QNAME) {
2388 if (cr->name_len) {
2389 name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
2390 name_len = cr->name_len;
2391 } else {
2392 name = name_a;
2393 name_len = strlen(fd->prefix);
2394 memcpy(name, fd->prefix, name_len);
2395 name += name_len;
2396 *name++ = ':';
2397 if (cr->mate_line >= 0 && cr->mate_line < rec)
2398 name = (char *)append_uint64((unsigned char *)name,
2399 s->hdr->record_counter +
2400 cr->mate_line + 1);
2401 else
2402 name = (char *)append_uint64((unsigned char *)name,
2403 s->hdr->record_counter +
2404 rec + 1);
2405 name_len = name - name_a;
2406 name = name_a;
2407 }
2408 } else {
2409 name = "?";
2410 name_len = 1;
2411 }
2412
2413 /* Generate BAM record */
2414 if (cr->rg < -1 || cr->rg >= bfd->nrg)
2415 return -1;
2416 rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
2417
2418 if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
2419 if (!BLOCK_DATA(s->seqs_blk))
2420 return -1;
2421 seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
2422 } else {
2423 seq = "*";
2424 cr->len = 1;
2425 }
2426
2427
2428 if (fd->required_fields & SAM_QUAL) {
2429 if (!BLOCK_DATA(s->qual_blk))
2430 return -1;
2431 qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
2432 } else {
2433 qual = NULL;
2434 }
2435
2436 bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len,
2437 name, name_len,
2438 cr->flags,
2439 cr->ref_id,
2440 cr->apos,
2441 cr->aend,
2442 cr->mqual,
2443 cr->ncigar, &s->cigar[cr->cigar],
2444 cr->mate_ref_id,
2445 cr->mate_pos,
2446 cr->tlen,
2447 cr->len,
2448 seq,
2449 qual);
2450 if (bam_idx == -1)
2451 return -1;
2452
2453 aux = aux_orig = (char *)bam_aux(*bam);
2454
2455 /* Auxiliary strings */
2456 if (cr->aux_size != 0) {
2457 memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
2458 aux += cr->aux_size;
2459 }
2460
2461 /* RG:Z: */
2462 if (cr->rg != -1) {
2463 int len = bfd->rg[cr->rg].name_len;
2464 *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
2465 memcpy(aux, bfd->rg[cr->rg].name, len);
2466 aux += len;
2467 *aux++ = 0;
2468 }
2469
2470 return bam_idx + (aux - aux_orig);
2471 }
2472
2473 /*
2474 * Here be dragons! The multi-threading code in this is crufty beyond belief.
2475 */
2476 static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
2477 cram_container *c;
2478 cram_slice *s = NULL;
2479
2480 if (!(c = fd->ctr)) {
2481 // Load first container.
2482 do {
2483 if (!(c = fd->ctr = cram_read_container(fd)))
2484 return NULL;
2485 } while (c->length == 0);
2486
2487 /*
2488 * The first container may be a result of a sub-range query.
2489 * In which case it may still not be the optimal starting point
2490 * due to skipped containers/slices in the index.
2491 */
2492 if (fd->range.refid != -2) {
2493 while (c->ref_seq_id != -2 &&
2494 (c->ref_seq_id < fd->range.refid ||
2495 c->ref_seq_start + c->ref_seq_span-1 < fd->range.start)) {
2496 if (0 != cram_seek(fd, c->length, SEEK_CUR))
2497 return NULL;
2498 cram_free_container(fd->ctr);
2499 do {
2500 if (!(c = fd->ctr = cram_read_container(fd)))
2501 return NULL;
2502 } while (c->length == 0);
2503 }
2504
2505 if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid)
2506 return NULL;
2507 }
2508
2509 if (!(c->comp_hdr_block = cram_read_block(fd)))
2510 return NULL;
2511 if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
2512 return NULL;
2513
2514 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
2515 if (!c->comp_hdr)
2516 return NULL;
2517 if (!c->comp_hdr->AP_delta) {
2518 pthread_mutex_lock(&fd->ref_lock);
2519 fd->unsorted = 1;
2520 pthread_mutex_unlock(&fd->ref_lock);
2521 }
2522 }
2523
2524 if ((s = c->slice)) {
2525 c->slice = NULL;
2526 cram_free_slice(s);
2527 s = NULL;
2528 }
2529
2530 if (c->curr_slice == c->max_slice) {
2531 cram_free_container(c);
2532 c = NULL;
2533 }
2534
2535 /* Sorry this is so contorted! */
2536 for (;;) {
2537 if (fd->job_pending) {
2538 cram_decode_job *j = (cram_decode_job *)fd->job_pending;
2539 c = j->c;
2540 s = j->s;
2541 free(fd->job_pending);
2542 fd->job_pending = NULL;
2543 } else if (!fd->ooc) {
2544 empty_container:
2545 if (!c || c->curr_slice == c->max_slice) {
2546 // new container
2547 do {
2548 if (!(c = fd->ctr = cram_read_container(fd))) {
2549 if (fd->pool) {
2550 fd->ooc = 1;
2551 break;
2552 }
2553
2554 return NULL;
2555 }
2556 } while (c->length == 0);
2557 if (fd->ooc)
2558 break;
2559
2560 /* Skip containers not yet spanning our range */
2561 if (fd->range.refid != -2 && c->ref_seq_id != -2) {
2562 fd->required_fields |= SAM_POS;
2563
2564 if (c->ref_seq_id != fd->range.refid) {
2565 cram_free_container(c);
2566 fd->ctr = NULL;
2567 fd->ooc = 1;
2568 fd->eof = 1;
2569 break;
2570 }
2571
2572 if (c->ref_seq_start > fd->range.end) {
2573 cram_free_container(c);
2574 fd->ctr = NULL;
2575 fd->ooc = 1;
2576 fd->eof = 1;
2577 break;
2578 }
2579
2580 if (c->ref_seq_start + c->ref_seq_span-1 <
2581 fd->range.start) {
2582 c->curr_rec = c->max_rec;
2583 c->curr_slice = c->max_slice;
2584 cram_seek(fd, c->length, SEEK_CUR);
2585 cram_free_container(c);
2586 c = NULL;
2587 continue;
2588 }
2589 }
2590
2591 if (!(c->comp_hdr_block = cram_read_block(fd)))
2592 return NULL;
2593 if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
2594 return NULL;
2595
2596 c->comp_hdr =
2597 cram_decode_compression_header(fd, c->comp_hdr_block);
2598 if (!c->comp_hdr)
2599 return NULL;
2600
2601 if (!c->comp_hdr->AP_delta) {
2602 pthread_mutex_lock(&fd->ref_lock);
2603 fd->unsorted = 1;
2604 pthread_mutex_unlock(&fd->ref_lock);
2605 }
2606 }
2607
2608 if (c->num_records == 0) {
2609 cram_free_container(c); c = NULL;
2610 goto empty_container;
2611 }
2612
2613
2614 if (!(s = c->slice = cram_read_slice(fd)))
2615 return NULL;
2616 c->curr_slice++;
2617 c->curr_rec = 0;
2618 c->max_rec = s->hdr->num_records;
2619
2620 s->last_apos = s->hdr->ref_seq_start;
2621
2622 /* Skip slices not yet spanning our range */
2623 if (fd->range.refid != -2 && s->hdr->ref_seq_id != -2) {
2624 if (s->hdr->ref_seq_id != fd->range.refid) {
2625 fd->eof = 1;
2626 cram_free_slice(s);
2627 c->slice = NULL;
2628 return NULL;
2629 }
2630
2631 if (s->hdr->ref_seq_start > fd->range.end) {
2632 fd->eof = 1;
2633 cram_free_slice(s);
2634 c->slice = NULL;
2635 return NULL;
2636 }
2637
2638 if (s->hdr->ref_seq_start + s->hdr->ref_seq_span-1 <
2639 fd->range.start) {
2640 cram_free_slice(s);
2641 c->slice = NULL;
2642 cram_free_container(c);
2643 c = NULL;
2644 continue;
2645 }
2646 }
2647 }
2648
2649 /* Test decoding of 1st seq */
2650 if (!c || !s)
2651 break;
2652
2653 if (cram_decode_slice_mt(fd, c, s, fd->header) != 0) {
2654 // if (cram_decode_slice(fd, c, s, fd->header) != 0) {
2655 fprintf(stderr, "Failure to decode slice\n");
2656 cram_free_slice(s);
2657 c->slice = NULL;
2658 return NULL;
2659 }
2660
2661 if (!fd->pool || fd->job_pending)
2662 break;
2663
2664 // Push it a bit far, to qsize in queue rather than pending arrival,
2665 // as cram tends to be a bit bursty in decode timings.
2666 if (t_pool_results_queue_len(fd->rqueue) > fd->pool->qsize)
2667 break;
2668 }
2669
2670 if (fd->pool) {
2671 t_pool_result *res;
2672 cram_decode_job *j;
2673
2674 // fprintf(stderr, "Thread pool len = %d, %d\n",
2675 // t_pool_results_queue_len(fd->rqueue),
2676 // t_pool_results_queue_sz(fd->rqueue));
2677
2678 if (fd->ooc && t_pool_results_queue_empty(fd->rqueue))
2679 return NULL;
2680
2681 res = t_pool_next_result_wait(fd->rqueue);
2682
2683 if (!res || !res->data) {
2684 fprintf(stderr, "t_pool_next_result failure\n");
2685 return NULL;
2686 }
2687
2688 j = (cram_decode_job *)res->data;
2689 c = j->c;
2690 s = j->s;
2691
2692 fd->ctr = c;
2693
2694 t_pool_delete_result(res, 1);
2695 }
2696
2697 *cp = c;
2698 return s;
2699 }
2700
2701 /*
2702 * Read the next cram record and return it.
2703 * Note that to decode cram_record the caller will need to look up some data
2704 * in the current slice, pointed to by fd->ctr->slice. This is valid until
2705 * the next call to cram_get_seq (which may invalidate it).
2706 *
2707 * Returns record pointer on success (do not free)
2708 * NULL on failure
2709 */
2710 cram_record *cram_get_seq(cram_fd *fd) {
2711 cram_container *c;
2712 cram_slice *s;
2713
2714 for (;;) {
2715 c = fd->ctr;
2716 if (c && c->slice && c->curr_rec < c->max_rec) {
2717 s = c->slice;
2718 } else {
2719 if (!(s = cram_next_slice(fd, &c)))
2720 return NULL;
2721 }
2722
2723 if (fd->range.refid != -2) {
2724 if (s->crecs[c->curr_rec].ref_id < fd->range.refid) {
2725 c->curr_rec++;
2726 continue;
2727 }
2728
2729 if (s->crecs[c->curr_rec].ref_id != fd->range.refid) {
2730 fd->eof = 1;
2731 cram_free_slice(s);
2732 c->slice = NULL;
2733 return NULL;
2734 }
2735
2736 if (s->crecs[c->curr_rec].apos > fd->range.end) {
2737 fd->eof = 1;
2738 cram_free_slice(s);
2739 c->slice = NULL;
2740 return NULL;
2741 }
2742
2743 if (s->crecs[c->curr_rec].aend < fd->range.start) {
2744 c->curr_rec++;
2745 continue;
2746 }
2747 }
2748
2749 break;
2750 }
2751
2752 fd->ctr = c;
2753 c->slice = s;
2754 return &s->crecs[c->curr_rec++];
2755 }
2756
2757 /*
2758 * Read the next cram record and convert it to a bam_seq_t struct.
2759 *
2760 * Returns 0 on success
2761 * -1 on EOF or failure (check fd->err)
2762 */
2763 int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
2764 cram_record *cr;
2765 cram_container *c;
2766 cram_slice *s;
2767
2768 if (!(cr = cram_get_seq(fd)))
2769 return -1;
2770
2771 c = fd->ctr;
2772 s = c->slice;
2773
2774 return cram_to_bam(fd->header, fd, s, cr, c->curr_rec-1, bam);
2775 }