0
|
1 /*
|
|
2 Copyright (c) 2012-2014 Genome Research Ltd.
|
|
3 Author: James Bonfield <jkb@sanger.ac.uk>
|
|
4
|
|
5 Redistribution and use in source and binary forms, with or without
|
|
6 modification, are permitted provided that the following conditions are met:
|
|
7
|
|
8 1. Redistributions of source code must retain the above copyright notice,
|
|
9 this list of conditions and the following disclaimer.
|
|
10
|
|
11 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12 this list of conditions and the following disclaimer in the documentation
|
|
13 and/or other materials provided with the distribution.
|
|
14
|
|
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
|
|
16 Institute nor the names of its contributors may be used to endorse or promote
|
|
17 products derived from this software without specific prior written permission.
|
|
18
|
|
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
|
|
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
|
|
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
29 */
|
|
30
|
|
31 /*
|
|
32 * - In-memory decoding of CRAM data structures.
|
|
33 * - Iterator for reading CRAM record by record.
|
|
34 */
|
|
35
|
|
36 #ifdef HAVE_CONFIG_H
|
|
37 #include "io_lib_config.h"
|
|
38 #endif
|
|
39
|
|
40 #include <stdio.h>
|
|
41 #include <errno.h>
|
|
42 #include <assert.h>
|
|
43 #include <stdlib.h>
|
|
44 #include <string.h>
|
|
45 #include <zlib.h>
|
|
46 #include <sys/types.h>
|
|
47 #include <sys/stat.h>
|
|
48 #include <math.h>
|
|
49 #include <ctype.h>
|
|
50
|
|
51 #include "cram/cram.h"
|
|
52 #include "cram/os.h"
|
|
53 #include "cram/md5.h"
|
|
54
|
|
55 //Whether CIGAR has just M or uses = and X to indicate match and mismatch
|
|
56 //#define USE_X
|
|
57
|
|
58 /* ----------------------------------------------------------------------
|
|
59 * CRAM compression headers
|
|
60 */
|
|
61
|
|
62 /*
|
|
63 * Decodes the Tag Dictionary record in the preservation map
|
|
64 * Updates the cram compression header.
|
|
65 *
|
|
66 * Returns number of bytes decoded on success
|
|
67 * -1 on failure
|
|
68 */
|
|
69 int cram_decode_TD(char *cp, cram_block_compression_hdr *h) {
|
|
70 char *op = cp;
|
|
71 unsigned char *dat;
|
|
72 cram_block *b;
|
|
73 int32_t blk_size;
|
|
74 int nTL, i, sz;
|
|
75
|
|
76 if (!(b = cram_new_block(0, 0)))
|
|
77 return -1;
|
|
78 h->TD_blk = b;
|
|
79
|
|
80 /* Decode */
|
|
81 cp += itf8_get(cp, &blk_size);
|
|
82 if (!blk_size) {
|
|
83 h->nTL = 0;
|
|
84 h->TL = NULL;
|
|
85 cram_free_block(b);
|
|
86 return cp - op;
|
|
87 }
|
|
88
|
|
89 BLOCK_APPEND(b, cp, blk_size);
|
|
90 cp += blk_size;
|
|
91 sz = cp - op;
|
|
92
|
|
93 // Force nul termination if missing
|
|
94 if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
|
|
95 BLOCK_APPEND_CHAR(b, '\0');
|
|
96
|
|
97 /* Set up TL lookup table */
|
|
98 dat = BLOCK_DATA(b);
|
|
99
|
|
100 // Count
|
|
101 for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
|
|
102 nTL++;
|
|
103 while (dat[i])
|
|
104 i++;
|
|
105 }
|
|
106
|
|
107 // Copy
|
|
108 h->nTL = nTL;
|
|
109 if (!(h->TL = calloc(h->nTL, sizeof(unsigned char *))))
|
|
110 return -1;
|
|
111 for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
|
|
112 h->TL[nTL++] = &dat[i];
|
|
113 while (dat[i])
|
|
114 i++;
|
|
115 }
|
|
116
|
|
117 return sz;
|
|
118 }
|
|
119
|
|
120 /*
|
|
121 * Decodes a CRAM block compression header.
|
|
122 * Returns header ptr on success
|
|
123 * NULL on failure
|
|
124 */
|
|
125 cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
|
|
126 cram_block *b) {
|
|
127 char *cp, *cp_copy;
|
|
128 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
|
|
129 int i;
|
|
130 int32_t map_size, map_count;
|
|
131
|
|
132 if (!hdr)
|
|
133 return NULL;
|
|
134
|
|
135 if (b->method != RAW) {
|
|
136 if (cram_uncompress_block(b)) {
|
|
137 free(hdr);
|
|
138 return NULL;
|
|
139 }
|
|
140 }
|
|
141
|
|
142 cp = (char *)b->data;
|
|
143
|
|
144 if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
|
145 cp += itf8_get(cp, &hdr->ref_seq_id);
|
|
146 cp += itf8_get(cp, &hdr->ref_seq_start);
|
|
147 cp += itf8_get(cp, &hdr->ref_seq_span);
|
|
148 cp += itf8_get(cp, &hdr->num_records);
|
|
149 cp += itf8_get(cp, &hdr->num_landmarks);
|
|
150 if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
|
|
151 free(hdr);
|
|
152 return NULL;
|
|
153 }
|
|
154 for (i = 0; i < hdr->num_landmarks; i++) {
|
|
155 cp += itf8_get(cp, &hdr->landmark[i]);
|
|
156 }
|
|
157 }
|
|
158
|
|
159 hdr->preservation_map = kh_init(map);
|
|
160
|
|
161 memset(hdr->rec_encoding_map, 0,
|
|
162 CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
|
|
163 memset(hdr->tag_encoding_map, 0,
|
|
164 CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
|
|
165
|
|
166 if (!hdr->preservation_map) {
|
|
167 cram_free_compression_header(hdr);
|
|
168 return NULL;
|
|
169 }
|
|
170
|
|
171 /* Initialise defaults for preservation map */
|
|
172 hdr->mapped_qs_included = 0;
|
|
173 hdr->unmapped_qs_included = 0;
|
|
174 hdr->unmapped_placed = 0;
|
|
175 hdr->qs_included = 0;
|
|
176 hdr->read_names_included = 0;
|
|
177 hdr->AP_delta = 1;
|
|
178 memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
|
|
179
|
|
180 /* Preservation map */
|
|
181 cp += itf8_get(cp, &map_size); cp_copy = cp;
|
|
182 cp += itf8_get(cp, &map_count);
|
|
183 for (i = 0; i < map_count; i++) {
|
|
184 pmap_t hd;
|
|
185 khint_t k;
|
|
186 int r;
|
|
187
|
|
188 cp += 2;
|
|
189 switch(CRAM_KEY(cp[-2],cp[-1])) {
|
|
190 case CRAM_KEY('M','I'):
|
|
191 hd.i = *cp++;
|
|
192 k = kh_put(map, hdr->preservation_map, "MI", &r);
|
|
193 if (-1 == r) {
|
|
194 cram_free_compression_header(hdr);
|
|
195 return NULL;
|
|
196 }
|
|
197
|
|
198 kh_val(hdr->preservation_map, k) = hd;
|
|
199 hdr->mapped_qs_included = hd.i;
|
|
200 break;
|
|
201
|
|
202 case CRAM_KEY('U','I'):
|
|
203 hd.i = *cp++;
|
|
204 k = kh_put(map, hdr->preservation_map, "UI", &r);
|
|
205 if (-1 == r) {
|
|
206 cram_free_compression_header(hdr);
|
|
207 return NULL;
|
|
208 }
|
|
209
|
|
210 kh_val(hdr->preservation_map, k) = hd;
|
|
211 hdr->unmapped_qs_included = hd.i;
|
|
212 break;
|
|
213
|
|
214 case CRAM_KEY('P','I'):
|
|
215 hd.i = *cp++;
|
|
216 k = kh_put(map, hdr->preservation_map, "PI", &r);
|
|
217 if (-1 == r) {
|
|
218 cram_free_compression_header(hdr);
|
|
219 return NULL;
|
|
220 }
|
|
221
|
|
222 kh_val(hdr->preservation_map, k) = hd;
|
|
223 hdr->unmapped_placed = hd.i;
|
|
224 break;
|
|
225
|
|
226 case CRAM_KEY('R','N'):
|
|
227 hd.i = *cp++;
|
|
228 k = kh_put(map, hdr->preservation_map, "RN", &r);
|
|
229 if (-1 == r) {
|
|
230 cram_free_compression_header(hdr);
|
|
231 return NULL;
|
|
232 }
|
|
233
|
|
234 kh_val(hdr->preservation_map, k) = hd;
|
|
235 hdr->read_names_included = hd.i;
|
|
236 break;
|
|
237
|
|
238 case CRAM_KEY('A','P'):
|
|
239 hd.i = *cp++;
|
|
240 k = kh_put(map, hdr->preservation_map, "AP", &r);
|
|
241 if (-1 == r) {
|
|
242 cram_free_compression_header(hdr);
|
|
243 return NULL;
|
|
244 }
|
|
245
|
|
246 kh_val(hdr->preservation_map, k) = hd;
|
|
247 hdr->AP_delta = hd.i;
|
|
248 break;
|
|
249
|
|
250 case CRAM_KEY('R','R'):
|
|
251 hd.i = *cp++;
|
|
252 k = kh_put(map, hdr->preservation_map, "RR", &r);
|
|
253 if (-1 == r) {
|
|
254 cram_free_compression_header(hdr);
|
|
255 return NULL;
|
|
256 }
|
|
257
|
|
258 kh_val(hdr->preservation_map, k) = hd;
|
|
259 fd->no_ref = !hd.i;
|
|
260 break;
|
|
261
|
|
262 case CRAM_KEY('S','M'):
|
|
263 hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
|
|
264 hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
|
|
265 hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
|
|
266 hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
|
|
267
|
|
268 hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
|
|
269 hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
|
|
270 hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
|
|
271 hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
|
|
272
|
|
273 hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
|
|
274 hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
|
|
275 hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
|
|
276 hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
|
|
277
|
|
278 hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
|
|
279 hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
|
|
280 hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
|
|
281 hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
|
|
282
|
|
283 hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
|
|
284 hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
|
|
285 hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
|
|
286 hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
|
|
287
|
|
288 hd.p = cp;
|
|
289 cp += 5;
|
|
290
|
|
291 k = kh_put(map, hdr->preservation_map, "SM", &r);
|
|
292 if (-1 == r) {
|
|
293 cram_free_compression_header(hdr);
|
|
294 return NULL;
|
|
295 }
|
|
296 kh_val(hdr->preservation_map, k) = hd;
|
|
297 break;
|
|
298
|
|
299 case CRAM_KEY('T','D'): {
|
|
300 int sz = cram_decode_TD(cp, hdr); // tag dictionary
|
|
301 if (sz < 0) {
|
|
302 cram_free_compression_header(hdr);
|
|
303 return NULL;
|
|
304 }
|
|
305
|
|
306 hd.p = cp;
|
|
307 cp += sz;
|
|
308
|
|
309 k = kh_put(map, hdr->preservation_map, "TD", &r);
|
|
310 if (-1 == r) {
|
|
311 cram_free_compression_header(hdr);
|
|
312 return NULL;
|
|
313 }
|
|
314 kh_val(hdr->preservation_map, k) = hd;
|
|
315 break;
|
|
316 }
|
|
317
|
|
318 default:
|
|
319 fprintf(stderr, "Unrecognised preservation map key %c%c\n",
|
|
320 cp[-2], cp[-1]);
|
|
321 // guess byte;
|
|
322 cp++;
|
|
323 break;
|
|
324 }
|
|
325 }
|
|
326 if (cp - cp_copy != map_size) {
|
|
327 cram_free_compression_header(hdr);
|
|
328 return NULL;
|
|
329 }
|
|
330
|
|
331 /* Record encoding map */
|
|
332 cp += itf8_get(cp, &map_size); cp_copy = cp;
|
|
333 cp += itf8_get(cp, &map_count);
|
|
334 for (i = 0; i < map_count; i++) {
|
|
335 char *key = cp;
|
|
336 int32_t encoding;
|
|
337 int32_t size;
|
|
338 cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
|
|
339
|
|
340 if (!m) {
|
|
341 cram_free_compression_header(hdr);
|
|
342 return NULL;
|
|
343 }
|
|
344
|
|
345 cp += 2;
|
|
346 cp += itf8_get(cp, &encoding);
|
|
347 cp += itf8_get(cp, &size);
|
|
348
|
|
349 // Fill out cram_map purely for cram_dump to dump out.
|
|
350 m->key = (key[0]<<8)|key[1];
|
|
351 m->encoding = encoding;
|
|
352 m->size = size;
|
|
353 m->offset = cp - (char *)b->data;
|
|
354 m->codec = NULL;
|
|
355
|
|
356 if (m->encoding == E_NULL)
|
|
357 continue;
|
|
358
|
|
359 //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
|
|
360
|
|
361 /*
|
|
362 * For CRAM1.0 CF and BF are Byte and not Int.
|
|
363 * Practically speaking it makes no difference unless we have a
|
|
364 * 1.0 format file that stores these in EXTERNAL as only then
|
|
365 * does Byte vs Int matter.
|
|
366 *
|
|
367 * Neither this C code nor Java reference implementations did this,
|
|
368 * so we gloss over it and treat them as int.
|
|
369 */
|
|
370
|
|
371 if (key[0] == 'B' && key[1] == 'F') {
|
|
372 if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size,
|
|
373 E_INT,
|
|
374 fd->version))) {
|
|
375 cram_free_compression_header(hdr);
|
|
376 return NULL;
|
|
377 }
|
|
378 } else if (key[0] == 'C' && key[1] == 'F') {
|
|
379 if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size,
|
|
380 E_INT,
|
|
381 fd->version))) {
|
|
382 cram_free_compression_header(hdr);
|
|
383 return NULL;
|
|
384 }
|
|
385 } else if (key[0] == 'R' && key[1] == 'I') {
|
|
386 if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size,
|
|
387 E_INT,
|
|
388 fd->version))) {
|
|
389 cram_free_compression_header(hdr);
|
|
390 return NULL;
|
|
391 }
|
|
392 } else if (key[0] == 'R' && key[1] == 'L') {
|
|
393 if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size,
|
|
394 E_INT,
|
|
395 fd->version))) {
|
|
396 cram_free_compression_header(hdr);
|
|
397 return NULL;
|
|
398 }
|
|
399 } else if (key[0] == 'A' && key[1] == 'P') {
|
|
400 if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size,
|
|
401 E_INT,
|
|
402 fd->version))) {
|
|
403 cram_free_compression_header(hdr);
|
|
404 return NULL;
|
|
405 }
|
|
406 } else if (key[0] == 'R' && key[1] == 'G') {
|
|
407 if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size,
|
|
408 E_INT,
|
|
409 fd->version))) {
|
|
410 cram_free_compression_header(hdr);
|
|
411 return NULL;
|
|
412 }
|
|
413 } else if (key[0] == 'M' && key[1] == 'F') {
|
|
414 if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size,
|
|
415 E_INT,
|
|
416 fd->version))) {
|
|
417 cram_free_compression_header(hdr);
|
|
418 return NULL;
|
|
419 }
|
|
420 } else if (key[0] == 'N' && key[1] == 'S') {
|
|
421 if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size,
|
|
422 E_INT,
|
|
423 fd->version))) {
|
|
424 cram_free_compression_header(hdr);
|
|
425 return NULL;
|
|
426 }
|
|
427 } else if (key[0] == 'N' && key[1] == 'P') {
|
|
428 if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size,
|
|
429 E_INT,
|
|
430 fd->version))) {
|
|
431 cram_free_compression_header(hdr);
|
|
432 return NULL;
|
|
433 }
|
|
434 } else if (key[0] == 'T' && key[1] == 'S') {
|
|
435 if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size,
|
|
436 E_INT,
|
|
437 fd->version))) {
|
|
438 cram_free_compression_header(hdr);
|
|
439 return NULL;
|
|
440 }
|
|
441 } else if (key[0] == 'N' && key[1] == 'F') {
|
|
442 if (!(hdr->codecs[DS_NF] = cram_decoder_init(encoding, cp, size,
|
|
443 E_INT,
|
|
444 fd->version))) {
|
|
445 cram_free_compression_header(hdr);
|
|
446 return NULL;
|
|
447 }
|
|
448 } else if (key[0] == 'T' && key[1] == 'C') {
|
|
449 if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size,
|
|
450 E_BYTE,
|
|
451 fd->version))) {
|
|
452 cram_free_compression_header(hdr);
|
|
453 return NULL;
|
|
454 }
|
|
455 } else if (key[0] == 'T' && key[1] == 'N') {
|
|
456 if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size,
|
|
457 E_INT,
|
|
458 fd->version))) {
|
|
459 cram_free_compression_header(hdr);
|
|
460 return NULL;
|
|
461 }
|
|
462 } else if (key[0] == 'F' && key[1] == 'N') {
|
|
463 if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size,
|
|
464 E_INT,
|
|
465 fd->version))) {
|
|
466 cram_free_compression_header(hdr);
|
|
467 return NULL;
|
|
468 }
|
|
469 } else if (key[0] == 'F' && key[1] == 'C') {
|
|
470 if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size,
|
|
471 E_BYTE,
|
|
472 fd->version))) {
|
|
473 cram_free_compression_header(hdr);
|
|
474 return NULL;
|
|
475 }
|
|
476 } else if (key[0] == 'F' && key[1] == 'P') {
|
|
477 if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size,
|
|
478 E_INT,
|
|
479 fd->version))) {
|
|
480 cram_free_compression_header(hdr);
|
|
481 return NULL;
|
|
482 }
|
|
483 } else if (key[0] == 'B' && key[1] == 'S') {
|
|
484 if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size,
|
|
485 E_BYTE,
|
|
486 fd->version))) {
|
|
487 cram_free_compression_header(hdr);
|
|
488 return NULL;
|
|
489 }
|
|
490 } else if (key[0] == 'I' && key[1] == 'N') {
|
|
491 if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, size,
|
|
492 E_BYTE_ARRAY,
|
|
493 fd->version))) {
|
|
494 cram_free_compression_header(hdr);
|
|
495 return NULL;
|
|
496 }
|
|
497 } else if (key[0] == 'S' && key[1] == 'C') {
|
|
498 if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size,
|
|
499 E_BYTE_ARRAY,
|
|
500 fd->version))) {
|
|
501 cram_free_compression_header(hdr);
|
|
502 return NULL;
|
|
503 }
|
|
504 } else if (key[0] == 'D' && key[1] == 'L') {
|
|
505 if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size,
|
|
506 E_INT,
|
|
507 fd->version))) {
|
|
508 cram_free_compression_header(hdr);
|
|
509 return NULL;
|
|
510 }
|
|
511 } else if (key[0] == 'B' && key[1] == 'A') {
|
|
512 if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size,
|
|
513 E_BYTE,
|
|
514 fd->version))) {
|
|
515 cram_free_compression_header(hdr);
|
|
516 return NULL;
|
|
517 }
|
|
518 } else if (key[0] == 'B' && key[1] == 'B') {
|
|
519 if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size,
|
|
520 E_BYTE_ARRAY,
|
|
521 fd->version))) {
|
|
522 cram_free_compression_header(hdr);
|
|
523 return NULL;
|
|
524 }
|
|
525 } else if (key[0] == 'R' && key[1] == 'S') {
|
|
526 if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size,
|
|
527 E_INT,
|
|
528 fd->version))) {
|
|
529 cram_free_compression_header(hdr);
|
|
530 return NULL;
|
|
531 }
|
|
532 } else if (key[0] == 'P' && key[1] == 'D') {
|
|
533 if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size,
|
|
534 E_INT,
|
|
535 fd->version))) {
|
|
536 cram_free_compression_header(hdr);
|
|
537 return NULL;
|
|
538 }
|
|
539 } else if (key[0] == 'H' && key[1] == 'C') {
|
|
540 if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size,
|
|
541 E_INT,
|
|
542 fd->version))) {
|
|
543 cram_free_compression_header(hdr);
|
|
544 return NULL;
|
|
545 }
|
|
546 } else if (key[0] == 'M' && key[1] == 'Q') {
|
|
547 if (!(hdr->codecs[DS_MQ] = cram_decoder_init(encoding, cp, size,
|
|
548 E_INT,
|
|
549 fd->version))) {
|
|
550 cram_free_compression_header(hdr);
|
|
551 return NULL;
|
|
552 }
|
|
553 } else if (key[0] == 'R' && key[1] == 'N') {
|
|
554 if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size,
|
|
555 E_BYTE_ARRAY_BLOCK,
|
|
556 fd->version))) {
|
|
557 cram_free_compression_header(hdr);
|
|
558 return NULL;
|
|
559 }
|
|
560 } else if (key[0] == 'Q' && key[1] == 'S') {
|
|
561 if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size,
|
|
562 E_BYTE,
|
|
563 fd->version))) {
|
|
564 cram_free_compression_header(hdr);
|
|
565 return NULL;
|
|
566 }
|
|
567 } else if (key[0] == 'Q' && key[1] == 'Q') {
|
|
568 if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size,
|
|
569 E_BYTE_ARRAY,
|
|
570 fd->version))) {
|
|
571 cram_free_compression_header(hdr);
|
|
572 return NULL;
|
|
573 }
|
|
574 } else if (key[0] == 'T' && key[1] == 'L') {
|
|
575 if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size,
|
|
576 E_INT,
|
|
577 fd->version))) {
|
|
578 cram_free_compression_header(hdr);
|
|
579 return NULL;
|
|
580 }
|
|
581 } else if (key[0] == 'T' && key[1] == 'M') {
|
|
582 } else if (key[0] == 'T' && key[1] == 'V') {
|
|
583 } else
|
|
584 fprintf(stderr, "Unrecognised key: %.2s\n", key);
|
|
585
|
|
586 cp += size;
|
|
587
|
|
588 m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
|
|
589 hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
|
|
590 }
|
|
591 if (cp - cp_copy != map_size) {
|
|
592 cram_free_compression_header(hdr);
|
|
593 return NULL;
|
|
594 }
|
|
595
|
|
596 /* Tag encoding map */
|
|
597 cp += itf8_get(cp, &map_size); cp_copy = cp;
|
|
598 cp += itf8_get(cp, &map_count);
|
|
599 for (i = 0; i < map_count; i++) {
|
|
600 int32_t encoding;
|
|
601 int32_t size;
|
|
602 cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
|
|
603 char *key = cp+1;
|
|
604
|
|
605 if (!m) {
|
|
606 cram_free_compression_header(hdr);
|
|
607 return NULL;
|
|
608 }
|
|
609
|
|
610 m->key = (key[0]<<16)|(key[1]<<8)|key[2];
|
|
611
|
|
612 cp += 4; // Strictly ITF8, but this suffices
|
|
613 cp += itf8_get(cp, &encoding);
|
|
614 cp += itf8_get(cp, &size);
|
|
615
|
|
616 m->encoding = encoding;
|
|
617 m->size = size;
|
|
618 m->offset = cp - (char *)b->data;
|
|
619 if (!(m->codec = cram_decoder_init(encoding, cp, size,
|
|
620 E_BYTE_ARRAY_BLOCK, fd->version))) {
|
|
621 cram_free_compression_header(hdr);
|
|
622 free(m);
|
|
623 return NULL;
|
|
624 }
|
|
625
|
|
626 cp += size;
|
|
627
|
|
628 m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
|
|
629 hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
|
|
630 }
|
|
631 if (cp - cp_copy != map_size) {
|
|
632 cram_free_compression_header(hdr);
|
|
633 return NULL;
|
|
634 }
|
|
635
|
|
636 return hdr;
|
|
637 }
|
|
638
|
|
639 /*
|
|
640 * Note we also need to scan through the record encoding map to
|
|
641 * see which data series share the same block, either external or
|
|
642 * CORE. For example if we need the BF data series but MQ and CF
|
|
643 * are also encoded in the same block then we need to add those in
|
|
644 * as a dependency in order to correctly decode BF.
|
|
645 *
|
|
646 * Returns 0 on success
|
|
647 * -1 on failure
|
|
648 */
|
|
649 int cram_dependent_data_series(cram_fd *fd,
|
|
650 cram_block_compression_hdr *hdr,
|
|
651 cram_slice *s) {
|
|
652 int *block_used;
|
|
653 int core_used = 0;
|
|
654 int i;
|
|
655 static int i_to_id[] = {
|
|
656 DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
|
|
657 DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
|
|
658 DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
|
|
659 DS_HC, DS_SC, DS_BB, DS_QQ,
|
|
660 };
|
|
661 uint32_t orig_ds;
|
|
662
|
|
663 /*
|
|
664 * Set the data_series bit field based on fd->required_fields
|
|
665 * contents.
|
|
666 */
|
|
667 if (fd->required_fields && fd->required_fields != INT_MAX) {
|
|
668 hdr->data_series = 0;
|
|
669
|
|
670 if (fd->required_fields & SAM_QNAME)
|
|
671 hdr->data_series |= CRAM_RN;
|
|
672
|
|
673 if (fd->required_fields & SAM_FLAG)
|
|
674 hdr->data_series |= CRAM_BF;
|
|
675
|
|
676 if (fd->required_fields & SAM_RNAME)
|
|
677 hdr->data_series |= CRAM_RI | CRAM_BF;
|
|
678
|
|
679 if (fd->required_fields & SAM_POS)
|
|
680 hdr->data_series |= CRAM_AP | CRAM_BF;
|
|
681
|
|
682 if (fd->required_fields & SAM_MAPQ)
|
|
683 hdr->data_series |= CRAM_MQ;
|
|
684
|
|
685 if (fd->required_fields & SAM_CIGAR)
|
|
686 hdr->data_series |= CRAM_CIGAR;
|
|
687
|
|
688 if (fd->required_fields & SAM_RNEXT)
|
|
689 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
|
|
690
|
|
691 if (fd->required_fields & SAM_PNEXT)
|
|
692 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
|
|
693
|
|
694 if (fd->required_fields & SAM_TLEN)
|
|
695 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
|
|
696 CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
|
|
697
|
|
698 if (fd->required_fields & SAM_SEQ)
|
|
699 hdr->data_series |= CRAM_SEQ;
|
|
700
|
|
701 if (!(fd->required_fields & SAM_AUX))
|
|
702 // No easy way to get MD/NM without other tags at present
|
|
703 fd->decode_md = 0;
|
|
704
|
|
705 if (fd->required_fields & SAM_QUAL)
|
|
706 hdr->data_series |= CRAM_SEQ;
|
|
707
|
|
708 if (fd->required_fields & SAM_AUX)
|
|
709 hdr->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
|
|
710
|
|
711 if (fd->required_fields & SAM_RGAUX)
|
|
712 hdr->data_series |= CRAM_RG | CRAM_BF;
|
|
713
|
|
714 // Always uncompress CORE block
|
|
715 if (cram_uncompress_block(s->block[0]))
|
|
716 return -1;
|
|
717 } else {
|
|
718 hdr->data_series = CRAM_ALL;
|
|
719
|
|
720 for (i = 0; i < s->hdr->num_blocks; i++) {
|
|
721 if (cram_uncompress_block(s->block[i]))
|
|
722 return -1;
|
|
723 }
|
|
724
|
|
725 return 0;
|
|
726 }
|
|
727
|
|
728 block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
|
|
729 if (!block_used)
|
|
730 return -1;
|
|
731
|
|
732 do {
|
|
733 /*
|
|
734 * Also set data_series based on code prerequisites. Eg if we need
|
|
735 * CRAM_QS then we also need to know CRAM_RL so we know how long it
|
|
736 * is, or if we need FC/FP then we also need FN (number of features).
|
|
737 *
|
|
738 * It's not reciprocal though. We may be needing to decode FN
|
|
739 * but have no need to decode FC, FP and cigar ops.
|
|
740 */
|
|
741 if (hdr->data_series & CRAM_RS) hdr->data_series |= CRAM_FC|CRAM_FP;
|
|
742 if (hdr->data_series & CRAM_PD) hdr->data_series |= CRAM_FC|CRAM_FP;
|
|
743 if (hdr->data_series & CRAM_HC) hdr->data_series |= CRAM_FC|CRAM_FP;
|
|
744 if (hdr->data_series & CRAM_QS) hdr->data_series |= CRAM_FC|CRAM_FP;
|
|
745 if (hdr->data_series & CRAM_IN) hdr->data_series |= CRAM_FC|CRAM_FP;
|
|
746 if (hdr->data_series & CRAM_SC) hdr->data_series |= CRAM_FC|CRAM_FP;
|
|
747 if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_FC|CRAM_FP;
|
|
748 if (hdr->data_series & CRAM_DL) hdr->data_series |= CRAM_FC|CRAM_FP;
|
|
749 if (hdr->data_series & CRAM_BA) hdr->data_series |= CRAM_FC|CRAM_FP;
|
|
750 if (hdr->data_series & CRAM_BB) hdr->data_series |= CRAM_FC|CRAM_FP;
|
|
751 if (hdr->data_series & CRAM_QQ) hdr->data_series |= CRAM_FC|CRAM_FP;
|
|
752
|
|
753 // cram_decode_seq() needs seq[] array
|
|
754 if (hdr->data_series & (CRAM_SEQ|CRAM_CIGAR)) hdr->data_series |= CRAM_RL;
|
|
755
|
|
756 if (hdr->data_series & CRAM_FP) hdr->data_series |= CRAM_FC;
|
|
757 if (hdr->data_series & CRAM_FC) hdr->data_series |= CRAM_FN;
|
|
758 if (hdr->data_series & CRAM_aux) hdr->data_series |= CRAM_TL;
|
|
759 if (hdr->data_series & CRAM_MF) hdr->data_series |= CRAM_CF;
|
|
760 if (hdr->data_series & CRAM_MQ) hdr->data_series |= CRAM_BF;
|
|
761 if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_RI;
|
|
762 if (hdr->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
|
|
763 hdr->data_series |= CRAM_CF;
|
|
764 if (!hdr->read_names_included && hdr->data_series & CRAM_RN)
|
|
765 hdr->data_series |= CRAM_CF | CRAM_NF;
|
|
766 if (hdr->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
|
|
767 hdr->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
|
|
768
|
|
769 orig_ds = hdr->data_series;
|
|
770
|
|
771 // Find which blocks are in use.
|
|
772 for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
|
|
773 int bnum1, bnum2, j;
|
|
774 cram_codec *c = hdr->codecs[i_to_id[i]];
|
|
775
|
|
776 if (!(hdr->data_series & (1<<i)))
|
|
777 continue;
|
|
778
|
|
779 if (!c)
|
|
780 continue;
|
|
781
|
|
782 bnum1 = cram_codec_to_id(c, &bnum2);
|
|
783
|
|
784 for (;;) {
|
|
785 switch (bnum1) {
|
|
786 case -2:
|
|
787 break;
|
|
788
|
|
789 case -1:
|
|
790 core_used = 1;
|
|
791 break;
|
|
792
|
|
793 default:
|
|
794 for (j = 0; j < s->hdr->num_blocks; j++) {
|
|
795 if (s->block[j]->content_type == EXTERNAL &&
|
|
796 s->block[j]->content_id == bnum1) {
|
|
797 block_used[j] = 1;
|
|
798 if (cram_uncompress_block(s->block[j])) {
|
|
799 free(block_used);
|
|
800 return -1;
|
|
801 }
|
|
802 }
|
|
803 }
|
|
804 break;
|
|
805 }
|
|
806
|
|
807 if (bnum2 == -2 || bnum1 == bnum2)
|
|
808 break;
|
|
809
|
|
810 bnum1 = bnum2; // 2nd pass
|
|
811 }
|
|
812 }
|
|
813
|
|
814 // Tags too
|
|
815 if ((fd->required_fields & SAM_AUX) ||
|
|
816 (hdr->data_series & CRAM_aux)) {
|
|
817 for (i = 0; i < CRAM_MAP_HASH; i++) {
|
|
818 int bnum1, bnum2, j;
|
|
819 cram_map *m = hdr->tag_encoding_map[i];
|
|
820
|
|
821 while (m) {
|
|
822 cram_codec *c = m->codec;
|
|
823 if (!c)
|
|
824 continue;
|
|
825
|
|
826 bnum1 = cram_codec_to_id(c, &bnum2);
|
|
827
|
|
828 for (;;) {
|
|
829 switch (bnum1) {
|
|
830 case -2:
|
|
831 break;
|
|
832
|
|
833 case -1:
|
|
834 core_used = 1;
|
|
835 break;
|
|
836
|
|
837 default:
|
|
838 for (j = 0; j < s->hdr->num_blocks; j++) {
|
|
839 if (s->block[j]->content_type == EXTERNAL &&
|
|
840 s->block[j]->content_id == bnum1) {
|
|
841 block_used[j] = 1;
|
|
842 if (cram_uncompress_block(s->block[j])) {
|
|
843 free(block_used);
|
|
844 return -1;
|
|
845 }
|
|
846 }
|
|
847 }
|
|
848 break;
|
|
849 }
|
|
850
|
|
851 if (bnum2 == -2 || bnum1 == bnum2)
|
|
852 break;
|
|
853
|
|
854 bnum1 = bnum2; // 2nd pass
|
|
855 }
|
|
856
|
|
857 m = m->next;
|
|
858 }
|
|
859 }
|
|
860 }
|
|
861
|
|
862 // We now know which blocks are in used, so repeat and find
|
|
863 // which other data series need to be added.
|
|
864 for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
|
|
865 int bnum1, bnum2, j;
|
|
866 cram_codec *c = hdr->codecs[i_to_id[i]];
|
|
867
|
|
868 if (!c)
|
|
869 continue;
|
|
870
|
|
871 bnum1 = cram_codec_to_id(c, &bnum2);
|
|
872
|
|
873 for (;;) {
|
|
874 switch (bnum1) {
|
|
875 case -2:
|
|
876 break;
|
|
877
|
|
878 case -1:
|
|
879 if (core_used) {
|
|
880 //printf(" + data series %08x:\n", 1<<i);
|
|
881 hdr->data_series |= 1<<i;
|
|
882 }
|
|
883 break;
|
|
884
|
|
885 default:
|
|
886 for (j = 0; j < s->hdr->num_blocks; j++) {
|
|
887 if (s->block[j]->content_type == EXTERNAL &&
|
|
888 s->block[j]->content_id == bnum1) {
|
|
889 if (block_used[j]) {
|
|
890 //printf(" + data series %08x:\n", 1<<i);
|
|
891 hdr->data_series |= 1<<i;
|
|
892 }
|
|
893 }
|
|
894 }
|
|
895 break;
|
|
896 }
|
|
897
|
|
898 if (bnum2 == -2 || bnum1 == bnum2)
|
|
899 break;
|
|
900
|
|
901 bnum1 = bnum2; // 2nd pass
|
|
902 }
|
|
903 }
|
|
904
|
|
905 // Tags too
|
|
906 for (i = 0; i < CRAM_MAP_HASH; i++) {
|
|
907 int bnum1, bnum2, j;
|
|
908 cram_map *m = hdr->tag_encoding_map[i];
|
|
909
|
|
910 while (m) {
|
|
911 cram_codec *c = m->codec;
|
|
912 if (!c)
|
|
913 continue;
|
|
914
|
|
915 bnum1 = cram_codec_to_id(c, &bnum2);
|
|
916
|
|
917 for (;;) {
|
|
918 switch (bnum1) {
|
|
919 case -2:
|
|
920 break;
|
|
921
|
|
922 case -1:
|
|
923 //printf(" + data series %08x:\n", CRAM_aux);
|
|
924 hdr->data_series |= CRAM_aux;
|
|
925 break;
|
|
926
|
|
927 default:
|
|
928 for (j = 0; j < s->hdr->num_blocks; j++) {
|
|
929 if (s->block[j]->content_type &&
|
|
930 s->block[j]->content_id == bnum1) {
|
|
931 if (block_used[j]) {
|
|
932 //printf(" + data series %08x:\n",
|
|
933 // CRAM_aux);
|
|
934 hdr->data_series |= CRAM_aux;
|
|
935 }
|
|
936 }
|
|
937 }
|
|
938 break;
|
|
939 }
|
|
940
|
|
941 if (bnum2 == -2 || bnum1 == bnum2)
|
|
942 break;
|
|
943
|
|
944 bnum1 = bnum2; // 2nd pass
|
|
945 }
|
|
946
|
|
947 m = m->next;
|
|
948 }
|
|
949 }
|
|
950 } while (orig_ds != hdr->data_series);
|
|
951
|
|
952 free(block_used);
|
|
953 return 0;
|
|
954 }
|
|
955
|
|
956 /* ----------------------------------------------------------------------
|
|
957 * CRAM slices
|
|
958 */
|
|
959
|
|
960 /*
|
|
961 * Decodes a CRAM (un)mapped slice header block.
|
|
962 * Returns slice header ptr on success
|
|
963 * NULL on failure
|
|
964 */
|
|
965 cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
|
|
966 cram_block_slice_hdr *hdr;
|
|
967 char *cp = (char *)b->data;
|
|
968 int i;
|
|
969
|
|
970 if (b->content_type != MAPPED_SLICE &&
|
|
971 b->content_type != UNMAPPED_SLICE)
|
|
972 return NULL;
|
|
973
|
|
974 if (!(hdr = calloc(1, sizeof(*hdr))))
|
|
975 return NULL;
|
|
976
|
|
977 hdr->content_type = b->content_type;
|
|
978
|
|
979 if (b->content_type == MAPPED_SLICE) {
|
|
980 cp += itf8_get(cp, &hdr->ref_seq_id);
|
|
981 cp += itf8_get(cp, &hdr->ref_seq_start);
|
|
982 cp += itf8_get(cp, &hdr->ref_seq_span);
|
|
983 }
|
|
984 cp += itf8_get(cp, &hdr->num_records);
|
|
985 hdr->record_counter = 0;
|
|
986 if (CRAM_MAJOR_VERS(fd->version) == 2) {
|
|
987 int32_t i32;
|
|
988 cp += itf8_get(cp, &i32);
|
|
989 hdr->record_counter = i32;
|
|
990 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
|
|
991 cp += ltf8_get(cp, &hdr->record_counter);
|
|
992 }
|
|
993
|
|
994 cp += itf8_get(cp, &hdr->num_blocks);
|
|
995
|
|
996 cp += itf8_get(cp, &hdr->num_content_ids);
|
|
997 hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
|
|
998 if (!hdr->block_content_ids) {
|
|
999 free(hdr);
|
|
1000 return NULL;
|
|
1001 }
|
|
1002
|
|
1003 for (i = 0; i < hdr->num_content_ids; i++) {
|
|
1004 cp += itf8_get(cp, &hdr->block_content_ids[i]);
|
|
1005 }
|
|
1006
|
|
1007 if (b->content_type == MAPPED_SLICE) {
|
|
1008 cp += itf8_get(cp, &hdr->ref_base_id);
|
|
1009 }
|
|
1010
|
|
1011 if (CRAM_MAJOR_VERS(fd->version) != 1) {
|
|
1012 memcpy(hdr->md5, cp, 16);
|
|
1013 } else {
|
|
1014 memset(hdr->md5, 0, 16);
|
|
1015 }
|
|
1016
|
|
1017 return hdr;
|
|
1018 }
|
|
1019
|
|
1020
|
|
1021 #if 0
|
|
1022 /* Returns the number of bits set in val; it the highest bit used */
|
|
1023 static int nbits(int v) {
|
|
1024 static const int MultiplyDeBruijnBitPosition[32] = {
|
|
1025 1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
|
|
1026 9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
|
|
1027 };
|
|
1028
|
|
1029 v |= v >> 1; // first up to set all bits 1 after the first 1 */
|
|
1030 v |= v >> 2;
|
|
1031 v |= v >> 4;
|
|
1032 v |= v >> 8;
|
|
1033 v |= v >> 16;
|
|
1034
|
|
1035 // DeBruijn magic to find top bit
|
|
1036 return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
|
|
1037 }
|
|
1038 #endif
|
|
1039
|
|
1040 #if 0
|
|
1041 static int sort_freqs(const void *vp1, const void *vp2) {
|
|
1042 const int i1 = *(const int *)vp1;
|
|
1043 const int i2 = *(const int *)vp2;
|
|
1044 return i1-i2;
|
|
1045 }
|
|
1046 #endif
|
|
1047
|
|
1048 /* ----------------------------------------------------------------------
|
|
1049 * Primary CRAM sequence decoder
|
|
1050 */
|
|
1051
|
|
1052 /*
|
|
1053 * Internal part of cram_decode_slice().
|
|
1054 * Generates the sequence, quality and cigar components.
|
|
1055 */
|
|
1056 static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
|
|
1057 cram_block *blk, cram_record *cr, SAM_hdr *bfd,
|
|
1058 int cf, char *seq, char *qual) {
|
|
1059 int prev_pos = 0, f, r = 0, out_sz = 1;
|
|
1060 int seq_pos = 1;
|
|
1061 int cig_len = 0, ref_pos = cr->apos;
|
|
1062 int32_t fn, i32;
|
|
1063 enum cigar_op cig_op = BAM_CMATCH;
|
|
1064 uint32_t *cigar = s->cigar;
|
|
1065 uint32_t ncigar = s->ncigar;
|
|
1066 uint32_t cigar_alloc = s->cigar_alloc;
|
|
1067 uint32_t nm = 0, md_dist = 0;
|
|
1068 int orig_aux = 0;
|
|
1069 int decode_md = fd->decode_md && s->ref;
|
|
1070 uint32_t ds = c->comp_hdr->data_series;
|
|
1071
|
|
1072 if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
|
|
1073 memset(qual, 30, cr->len);
|
|
1074 }
|
|
1075
|
|
1076 if (decode_md) {
|
|
1077 orig_aux = BLOCK_SIZE(s->aux_blk);
|
|
1078 BLOCK_APPEND(s->aux_blk, "MDZ", 3);
|
|
1079 }
|
|
1080
|
|
1081 if (ds & CRAM_FN) {
|
|
1082 if (!c->comp_hdr->codecs[DS_FN]) return -1;
|
|
1083 r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN],
|
|
1084 blk, (char *)&fn, &out_sz);
|
|
1085 } else {
|
|
1086 fn = 0;
|
|
1087 }
|
|
1088
|
|
1089 ref_pos--; // count from 0
|
|
1090 cr->cigar = ncigar;
|
|
1091
|
|
1092 if (!(ds & (CRAM_FC | CRAM_FP)))
|
|
1093 goto skip_cigar;
|
|
1094
|
|
1095 for (f = 0; f < fn; f++) {
|
|
1096 int32_t pos = 0;
|
|
1097 char op;
|
|
1098
|
|
1099 if (ncigar+2 >= cigar_alloc) {
|
|
1100 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
|
|
1101 s->cigar = cigar;
|
|
1102 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
|
|
1103 return -1;
|
|
1104 }
|
|
1105
|
|
1106 if (ds & CRAM_FC) {
|
|
1107 if (!c->comp_hdr->codecs[DS_FC]) return -1;
|
|
1108 r |= c->comp_hdr->codecs[DS_FC]->decode(s,
|
|
1109 c->comp_hdr->codecs[DS_FC],
|
|
1110 blk,
|
|
1111 &op, &out_sz);
|
|
1112 }
|
|
1113
|
|
1114 if (!(ds & CRAM_FP))
|
|
1115 continue;
|
|
1116
|
|
1117 if (!c->comp_hdr->codecs[DS_FP]) return -1;
|
|
1118 r |= c->comp_hdr->codecs[DS_FP]->decode(s,
|
|
1119 c->comp_hdr->codecs[DS_FP],
|
|
1120 blk,
|
|
1121 (char *)&pos, &out_sz);
|
|
1122 pos += prev_pos;
|
|
1123
|
|
1124 if (pos > seq_pos) {
|
|
1125 if (pos > cr->len+1)
|
|
1126 return -1;
|
|
1127
|
|
1128 if (s->ref && cr->ref_id >= 0) {
|
|
1129 if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
|
|
1130 static int whinged = 0;
|
|
1131 if (!whinged)
|
|
1132 fprintf(stderr, "Ref pos outside of ref "
|
|
1133 "sequence boundary\n");
|
|
1134 whinged = 1;
|
|
1135 } else {
|
|
1136 memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
|
|
1137 pos - seq_pos);
|
|
1138 }
|
|
1139 }
|
|
1140 #ifdef USE_X
|
|
1141 if (cig_len && cig_op != BAM_CBASE_MATCH) {
|
|
1142 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1143 cig_len = 0;
|
|
1144 }
|
|
1145 cig_op = BAM_CBASE_MATCH;
|
|
1146 #else
|
|
1147 if (cig_len && cig_op != BAM_CMATCH) {
|
|
1148 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1149 cig_len = 0;
|
|
1150 }
|
|
1151 cig_op = BAM_CMATCH;
|
|
1152 #endif
|
|
1153 cig_len += pos - seq_pos;
|
|
1154 ref_pos += pos - seq_pos;
|
|
1155 md_dist += pos - seq_pos;
|
|
1156 seq_pos = pos;
|
|
1157 }
|
|
1158
|
|
1159 prev_pos = pos;
|
|
1160
|
|
1161 if (!(ds & CRAM_FC))
|
|
1162 goto skip_cigar;
|
|
1163
|
|
1164 if (!(ds & CRAM_FC))
|
|
1165 continue;
|
|
1166
|
|
1167 switch(op) {
|
|
1168 case 'S': { // soft clip: IN
|
|
1169 int32_t out_sz2 = 1;
|
|
1170
|
|
1171 if (cig_len) {
|
|
1172 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1173 cig_len = 0;
|
|
1174 }
|
|
1175 if (ds & CRAM_IN) {
|
|
1176 switch (CRAM_MAJOR_VERS(fd->version)) {
|
|
1177 case 1:
|
|
1178 r |= c->comp_hdr->codecs[DS_IN]
|
|
1179 ? c->comp_hdr->codecs[DS_IN]
|
|
1180 ->decode(s, c->comp_hdr->codecs[DS_IN],
|
|
1181 blk, &seq[pos-1], &out_sz2)
|
|
1182 : (seq[pos-1] = 'N', out_sz2 = 1, 0);
|
|
1183 break;
|
|
1184
|
|
1185 case 2:
|
|
1186 default:
|
|
1187 r |= c->comp_hdr->codecs[DS_SC]
|
|
1188 ? c->comp_hdr->codecs[DS_SC]
|
|
1189 ->decode(s, c->comp_hdr->codecs[DS_SC],
|
|
1190 blk, &seq[pos-1], &out_sz2)
|
|
1191 : (seq[pos-1] = 'N', out_sz2 = 1, 0);
|
|
1192 break;
|
|
1193
|
|
1194 // default:
|
|
1195 // r |= c->comp_hdr->codecs[DS_BB]
|
|
1196 // ? c->comp_hdr->codecs[DS_BB]
|
|
1197 // ->decode(s, c->comp_hdr->codecs[DS_BB],
|
|
1198 // blk, &seq[pos-1], &out_sz2)
|
|
1199 // : (seq[pos-1] = 'N', out_sz2 = 1, 0);
|
|
1200 }
|
|
1201 cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
|
|
1202 cig_op = BAM_CSOFT_CLIP;
|
|
1203 seq_pos += out_sz2;
|
|
1204 }
|
|
1205 break;
|
|
1206 }
|
|
1207
|
|
1208 case 'X': { // Substitution; BS
|
|
1209 unsigned char base;
|
|
1210 #ifdef USE_X
|
|
1211 if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
|
|
1212 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1213 cig_len = 0;
|
|
1214 }
|
|
1215 if (ds & CRAM_BS) {
|
|
1216 if (!c->comp_hdr->codecs[DS_BS]) return -1;
|
|
1217 r |= c->comp_hdr->codecs[DS_BS]
|
|
1218 ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
|
|
1219 (char *)&base, &out_sz);
|
|
1220 seq[pos-1] = 'N'; // FIXME look up BS=base value
|
|
1221 }
|
|
1222 cig_op = BAM_CBASE_MISMATCH;
|
|
1223 #else
|
|
1224 int ref_base;
|
|
1225 if (cig_len && cig_op != BAM_CMATCH) {
|
|
1226 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1227 cig_len = 0;
|
|
1228 }
|
|
1229 if (ds & CRAM_BS) {
|
|
1230 if (!c->comp_hdr->codecs[DS_BS]) return -1;
|
|
1231 r |= c->comp_hdr->codecs[DS_BS]
|
|
1232 ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
|
|
1233 (char *)&base, &out_sz);
|
|
1234 if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
|
|
1235 seq[pos-1] = 'N';
|
|
1236 } else {
|
|
1237 ref_base = fd->L1[(uc)s->ref[ref_pos - s->ref_start +1]];
|
|
1238 seq[pos-1] = c->comp_hdr->
|
|
1239 substitution_matrix[ref_base][base];
|
|
1240 if (decode_md) {
|
|
1241 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
|
|
1242 BLOCK_APPEND_CHAR(s->aux_blk,
|
|
1243 s->ref[ref_pos-s->ref_start +1]);
|
|
1244 md_dist = 0;
|
|
1245 }
|
|
1246 }
|
|
1247 }
|
|
1248 cig_op = BAM_CMATCH;
|
|
1249 #endif
|
|
1250 nm++;
|
|
1251 cig_len++;
|
|
1252 seq_pos++;
|
|
1253 ref_pos++;
|
|
1254 break;
|
|
1255 }
|
|
1256
|
|
1257 case 'D': { // Deletion; DL
|
|
1258 if (cig_len && cig_op != BAM_CDEL) {
|
|
1259 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1260 cig_len = 0;
|
|
1261 }
|
|
1262 if (ds & CRAM_DL) {
|
|
1263 if (!c->comp_hdr->codecs[DS_DL]) return -1;
|
|
1264 r |= c->comp_hdr->codecs[DS_DL]
|
|
1265 ->decode(s, c->comp_hdr->codecs[DS_DL], blk,
|
|
1266 (char *)&i32, &out_sz);
|
|
1267 if (decode_md) {
|
|
1268 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
|
|
1269 BLOCK_APPEND_CHAR(s->aux_blk, '^');
|
|
1270 BLOCK_APPEND(s->aux_blk,
|
|
1271 &s->ref[ref_pos - s->ref_start +1],
|
|
1272 i32);
|
|
1273 md_dist = 0;
|
|
1274 }
|
|
1275 cig_op = BAM_CDEL;
|
|
1276 cig_len += i32;
|
|
1277 ref_pos += i32;
|
|
1278 nm += i32;
|
|
1279 //printf(" %d: DL = %d (ret %d)\n", f, i32, r);
|
|
1280 }
|
|
1281 break;
|
|
1282 }
|
|
1283
|
|
1284 case 'I': { // Insertion (several bases); IN
|
|
1285 int32_t out_sz2 = 1;
|
|
1286
|
|
1287 if (cig_len && cig_op != BAM_CINS) {
|
|
1288 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1289 cig_len = 0;
|
|
1290 }
|
|
1291
|
|
1292 if (ds & CRAM_IN) {
|
|
1293 if (!c->comp_hdr->codecs[DS_IN]) return -1;
|
|
1294 r |= c->comp_hdr->codecs[DS_IN]
|
|
1295 ->decode(s, c->comp_hdr->codecs[DS_IN], blk,
|
|
1296 &seq[pos-1], &out_sz2);
|
|
1297 cig_op = BAM_CINS;
|
|
1298 cig_len += out_sz2;
|
|
1299 seq_pos += out_sz2;
|
|
1300 nm += out_sz2;
|
|
1301 //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
|
|
1302 }
|
|
1303 break;
|
|
1304 }
|
|
1305
|
|
1306 case 'i': { // Insertion (single base); BA
|
|
1307 if (cig_len && cig_op != BAM_CINS) {
|
|
1308 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1309 cig_len = 0;
|
|
1310 }
|
|
1311 if (ds & CRAM_BA) {
|
|
1312 if (!c->comp_hdr->codecs[DS_BA]) return -1;
|
|
1313 r |= c->comp_hdr->codecs[DS_BA]
|
|
1314 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
|
|
1315 (char *)&seq[pos-1], &out_sz);
|
|
1316 }
|
|
1317 cig_op = BAM_CINS;
|
|
1318 cig_len++;
|
|
1319 seq_pos++;
|
|
1320 nm++;
|
|
1321 break;
|
|
1322 }
|
|
1323
|
|
1324 case 'b': { // Several bases
|
|
1325 int32_t len = 1;
|
|
1326
|
|
1327 if (cig_len && cig_op != BAM_CMATCH) {
|
|
1328 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1329 cig_len = 0;
|
|
1330 }
|
|
1331
|
|
1332 if (ds & CRAM_BB) {
|
|
1333 if (!c->comp_hdr->codecs[DS_BB]) return -1;
|
|
1334 r |= c->comp_hdr->codecs[DS_BB]
|
|
1335 ->decode(s, c->comp_hdr->codecs[DS_BB], blk,
|
|
1336 (char *)&seq[pos-1], &len);
|
|
1337 }
|
|
1338
|
|
1339 cig_op = BAM_CMATCH;
|
|
1340
|
|
1341 cig_len+=len;
|
|
1342 seq_pos+=len;
|
|
1343 ref_pos+=len;
|
|
1344 //prev_pos+=len;
|
|
1345 break;
|
|
1346 }
|
|
1347
|
|
1348 case 'q': { // Several quality values
|
|
1349 int32_t len = 1;
|
|
1350
|
|
1351 if (cig_len && cig_op != BAM_CMATCH) {
|
|
1352 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1353 cig_len = 0;
|
|
1354 }
|
|
1355
|
|
1356 if (ds & CRAM_QQ) {
|
|
1357 if (!c->comp_hdr->codecs[DS_QQ]) return -1;
|
|
1358 r |= c->comp_hdr->codecs[DS_QQ]
|
|
1359 ->decode(s, c->comp_hdr->codecs[DS_QQ], blk,
|
|
1360 (char *)&qual[pos-1], &len);
|
|
1361 }
|
|
1362
|
|
1363 cig_op = BAM_CMATCH;
|
|
1364
|
|
1365 cig_len+=len;
|
|
1366 seq_pos+=len;
|
|
1367 ref_pos+=len;
|
|
1368 //prev_pos+=len;
|
|
1369 break;
|
|
1370 }
|
|
1371
|
|
1372 case 'B': { // Read base; BA, QS
|
|
1373 #ifdef USE_X
|
|
1374 if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
|
|
1375 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1376 cig_len = 0;
|
|
1377 }
|
|
1378 #else
|
|
1379 if (cig_len && cig_op != BAM_CMATCH) {
|
|
1380 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1381 cig_len = 0;
|
|
1382 }
|
|
1383 #endif
|
|
1384 if (ds & CRAM_BA) {
|
|
1385 if (!c->comp_hdr->codecs[DS_BA]) return -1;
|
|
1386 r |= c->comp_hdr->codecs[DS_BA]
|
|
1387 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
|
|
1388 (char *)&seq[pos-1], &out_sz);
|
|
1389 }
|
|
1390 if (ds & CRAM_QS) {
|
|
1391 if (!c->comp_hdr->codecs[DS_QS]) return -1;
|
|
1392 r |= c->comp_hdr->codecs[DS_QS]
|
|
1393 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
|
|
1394 (char *)&qual[pos-1], &out_sz);
|
|
1395 }
|
|
1396 #ifdef USE_X
|
|
1397 cig_op = BAM_CBASE_MISMATCH;
|
|
1398 #else
|
|
1399 cig_op = BAM_CMATCH;
|
|
1400 #endif
|
|
1401 cig_len++;
|
|
1402 seq_pos++;
|
|
1403 ref_pos++;
|
|
1404 //printf(" %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
|
|
1405 break;
|
|
1406 }
|
|
1407
|
|
1408 case 'Q': { // Quality score; QS
|
|
1409 if (ds & CRAM_QS) {
|
|
1410 if (!c->comp_hdr->codecs[DS_QS]) return -1;
|
|
1411 r |= c->comp_hdr->codecs[DS_QS]
|
|
1412 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
|
|
1413 (char *)&qual[pos-1], &out_sz);
|
|
1414 //printf(" %d: QS = %d (ret %d)\n", f, qc, r);
|
|
1415 }
|
|
1416 break;
|
|
1417 }
|
|
1418
|
|
1419 case 'H': { // hard clip; HC
|
|
1420 if (cig_len && cig_op != BAM_CHARD_CLIP) {
|
|
1421 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1422 cig_len = 0;
|
|
1423 }
|
|
1424 if (ds & CRAM_HC) {
|
|
1425 if (!c->comp_hdr->codecs[DS_HC]) return -1;
|
|
1426 r |= c->comp_hdr->codecs[DS_HC]
|
|
1427 ->decode(s, c->comp_hdr->codecs[DS_HC], blk,
|
|
1428 (char *)&i32, &out_sz);
|
|
1429 cig_op = BAM_CHARD_CLIP;
|
|
1430 cig_len += i32;
|
|
1431 nm += i32;
|
|
1432 }
|
|
1433 break;
|
|
1434 }
|
|
1435
|
|
1436 case 'P': { // padding; PD
|
|
1437 if (cig_len && cig_op != BAM_CPAD) {
|
|
1438 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1439 cig_len = 0;
|
|
1440 }
|
|
1441 if (ds & CRAM_PD) {
|
|
1442 if (!c->comp_hdr->codecs[DS_PD]) return -1;
|
|
1443 r |= c->comp_hdr->codecs[DS_PD]
|
|
1444 ->decode(s, c->comp_hdr->codecs[DS_PD], blk,
|
|
1445 (char *)&i32, &out_sz);
|
|
1446 cig_op = BAM_CPAD;
|
|
1447 cig_len += i32;
|
|
1448 nm += i32;
|
|
1449 }
|
|
1450 break;
|
|
1451 }
|
|
1452
|
|
1453 case 'N': { // Ref skip; RS
|
|
1454 if (cig_len && cig_op != BAM_CREF_SKIP) {
|
|
1455 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1456 cig_len = 0;
|
|
1457 }
|
|
1458 if (ds & CRAM_RS) {
|
|
1459 if (!c->comp_hdr->codecs[DS_RS]) return -1;
|
|
1460 r |= c->comp_hdr->codecs[DS_RS]
|
|
1461 ->decode(s, c->comp_hdr->codecs[DS_RS], blk,
|
|
1462 (char *)&i32, &out_sz);
|
|
1463 cig_op = BAM_CREF_SKIP;
|
|
1464 cig_len += i32;
|
|
1465 ref_pos += i32;
|
|
1466 nm += i32;
|
|
1467 }
|
|
1468 break;
|
|
1469 }
|
|
1470
|
|
1471 default:
|
|
1472 abort();
|
|
1473 }
|
|
1474 }
|
|
1475
|
|
1476 if (!(ds & CRAM_FC))
|
|
1477 goto skip_cigar;
|
|
1478
|
|
1479 /* An implement match op for any unaccounted for bases */
|
|
1480 if ((ds & CRAM_FN) && cr->len >= seq_pos) {
|
|
1481 if (s->ref) {
|
|
1482 if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
|
|
1483 static int whinged = 0;
|
|
1484 if (!whinged)
|
|
1485 fprintf(stderr, "Ref pos outside of ref sequence boundary\n");
|
|
1486 whinged = 1;
|
|
1487 } else {
|
|
1488 memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
|
|
1489 cr->len - seq_pos + 1);
|
|
1490 ref_pos += cr->len - seq_pos + 1;
|
|
1491 md_dist += cr->len - seq_pos + 1;
|
|
1492 }
|
|
1493 }
|
|
1494
|
|
1495 if (ncigar+1 >= cigar_alloc) {
|
|
1496 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
|
|
1497 s->cigar = cigar;
|
|
1498 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
|
|
1499 return -1;
|
|
1500 }
|
|
1501 #ifdef USE_X
|
|
1502 if (cig_len && cig_op != BAM_CBASE_MATCH) {
|
|
1503 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1504 cig_len = 0;
|
|
1505 }
|
|
1506 cig_op = BAM_CBASE_MATCH;
|
|
1507 #else
|
|
1508 if (cig_len && cig_op != BAM_CMATCH) {
|
|
1509 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1510 cig_len = 0;
|
|
1511 }
|
|
1512 cig_op = BAM_CMATCH;
|
|
1513 #endif
|
|
1514 cig_len += cr->len - seq_pos+1;
|
|
1515 }
|
|
1516
|
|
1517 skip_cigar:
|
|
1518
|
|
1519 if ((ds & CRAM_FN) && decode_md) {
|
|
1520 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
|
|
1521 }
|
|
1522
|
|
1523 if (cig_len) {
|
|
1524 if (ncigar >= cigar_alloc) {
|
|
1525 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
|
|
1526 s->cigar = cigar;
|
|
1527 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
|
|
1528 return -1;
|
|
1529 }
|
|
1530
|
|
1531 cigar[ncigar++] = (cig_len<<4) + cig_op;
|
|
1532 }
|
|
1533
|
|
1534 cr->ncigar = ncigar - cr->cigar;
|
|
1535 cr->aend = ref_pos;
|
|
1536
|
|
1537 //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
|
|
1538
|
|
1539 if (ds & CRAM_MQ) {
|
|
1540 if (!c->comp_hdr->codecs[DS_MQ]) return -1;
|
|
1541 r |= c->comp_hdr->codecs[DS_MQ]
|
|
1542 ->decode(s, c->comp_hdr->codecs[DS_MQ], blk,
|
|
1543 (char *)&cr->mqual, &out_sz);
|
|
1544 } else {
|
|
1545 cr->mqual = 40;
|
|
1546 }
|
|
1547
|
|
1548 if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
|
|
1549 int32_t out_sz2 = cr->len;
|
|
1550
|
|
1551 if (ds & CRAM_QS) {
|
|
1552 if (!c->comp_hdr->codecs[DS_QS]) return -1;
|
|
1553 r |= c->comp_hdr->codecs[DS_QS]
|
|
1554 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
|
|
1555 qual, &out_sz2);
|
|
1556 }
|
|
1557 }
|
|
1558
|
|
1559 s->cigar = cigar;
|
|
1560 s->cigar_alloc = cigar_alloc;
|
|
1561 s->ncigar = ncigar;
|
|
1562
|
|
1563 if (decode_md) {
|
|
1564 char buf[7];
|
|
1565 BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
|
|
1566 cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux;
|
|
1567 buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I';
|
|
1568 buf[3] = (nm>> 0) & 0xff;
|
|
1569 buf[4] = (nm>> 8) & 0xff;
|
|
1570 buf[5] = (nm>>16) & 0xff;
|
|
1571 buf[6] = (nm>>24) & 0xff;
|
|
1572 BLOCK_APPEND(s->aux_blk, buf, 7);
|
|
1573 cr->aux_size += 7;
|
|
1574 }
|
|
1575
|
|
1576 return r;
|
|
1577 }
|
|
1578
|
|
1579 /*
|
|
1580 * Quick and simple hash lookup for cram_map arrays
|
|
1581 */
|
|
1582 static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
|
|
1583 cram_map *m;
|
|
1584
|
|
1585 m = map[CRAM_MAP(key[0],key[1])];
|
|
1586 while (m && m->key != id)
|
|
1587 m= m->next;
|
|
1588
|
|
1589 return m;
|
|
1590 }
|
|
1591
|
|
1592 //#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
|
|
1593
|
|
1594
|
|
1595 static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
|
|
1596 cram_block *blk, cram_record *cr) {
|
|
1597 int i, r = 0, out_sz = 1;
|
|
1598 unsigned char ntags;
|
|
1599
|
|
1600 if (!c->comp_hdr->codecs[DS_TC]) return -1;
|
|
1601 r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
|
|
1602 (char *)&ntags, &out_sz);
|
|
1603 cr->ntags = ntags;
|
|
1604
|
|
1605 //printf("TC=%d\n", cr->ntags);
|
|
1606 cr->aux_size = 0;
|
|
1607 cr->aux = BLOCK_SIZE(s->aux_blk);
|
|
1608
|
|
1609 for (i = 0; i < cr->ntags; i++) {
|
|
1610 int32_t id, out_sz = 1;
|
|
1611 unsigned char tag_data[3];
|
|
1612 cram_map *m;
|
|
1613
|
|
1614 //printf("Tag %d/%d\n", i+1, cr->ntags);
|
|
1615 if (!c->comp_hdr->codecs[DS_TN]) return -1;
|
|
1616 r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
|
|
1617 blk, (char *)&id, &out_sz);
|
|
1618 if (out_sz == 3) {
|
|
1619 tag_data[0] = ((char *)&id)[0];
|
|
1620 tag_data[1] = ((char *)&id)[1];
|
|
1621 tag_data[2] = ((char *)&id)[2];
|
|
1622 } else {
|
|
1623 tag_data[0] = (id>>16) & 0xff;
|
|
1624 tag_data[1] = (id>>8) & 0xff;
|
|
1625 tag_data[2] = id & 0xff;
|
|
1626 }
|
|
1627
|
|
1628 m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
|
|
1629 if (!m)
|
|
1630 return -1;
|
|
1631 BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
|
|
1632
|
|
1633 if (!m->codec) return -1;
|
|
1634 r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
|
|
1635
|
|
1636 cr->aux_size += out_sz + 3;
|
|
1637 }
|
|
1638
|
|
1639 return r;
|
|
1640 }
|
|
1641
|
|
1642 static int cram_decode_aux(cram_container *c, cram_slice *s,
|
|
1643 cram_block *blk, cram_record *cr) {
|
|
1644 int i, r = 0, out_sz = 1;
|
|
1645 int32_t TL = 0;
|
|
1646 unsigned char *TN;
|
|
1647 uint32_t ds = c->comp_hdr->data_series;
|
|
1648
|
|
1649 if (!(ds & (CRAM_TL|CRAM_aux))) {
|
|
1650 cr->aux = 0;
|
|
1651 cr->aux_size = 0;
|
|
1652 return 0;
|
|
1653 }
|
|
1654
|
|
1655 if (!c->comp_hdr->codecs[DS_TL]) return -1;
|
|
1656 r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
|
|
1657 (char *)&TL, &out_sz);
|
|
1658 if (r || TL < 0 || TL >= c->comp_hdr->nTL)
|
|
1659 return -1;
|
|
1660
|
|
1661 TN = c->comp_hdr->TL[TL];
|
|
1662 cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
|
|
1663
|
|
1664 //printf("TC=%d\n", cr->ntags);
|
|
1665 cr->aux_size = 0;
|
|
1666 cr->aux = BLOCK_SIZE(s->aux_blk);
|
|
1667
|
|
1668 if (!(ds & CRAM_aux))
|
|
1669 return 0;
|
|
1670
|
|
1671 for (i = 0; i < cr->ntags; i++) {
|
|
1672 int32_t id, out_sz = 1;
|
|
1673 unsigned char tag_data[3];
|
|
1674 cram_map *m;
|
|
1675
|
|
1676 //printf("Tag %d/%d\n", i+1, cr->ntags);
|
|
1677 tag_data[0] = *TN++;
|
|
1678 tag_data[1] = *TN++;
|
|
1679 tag_data[2] = *TN++;
|
|
1680 id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
|
|
1681
|
|
1682 m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
|
|
1683 if (!m)
|
|
1684 return -1;
|
|
1685 BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
|
|
1686
|
|
1687 if (!m->codec) return -1;
|
|
1688 r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
|
|
1689 cr->aux_size += out_sz + 3;
|
|
1690 }
|
|
1691
|
|
1692 return r;
|
|
1693 }
|
|
1694
|
|
1695 /* Resolve mate pair cross-references between recs within this slice */
|
|
1696 static void cram_decode_slice_xref(cram_slice *s, int required_fields) {
|
|
1697 int rec;
|
|
1698
|
|
1699 if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
|
|
1700 for (rec = 0; rec < s->hdr->num_records; rec++) {
|
|
1701 cram_record *cr = &s->crecs[rec];
|
|
1702
|
|
1703 cr->tlen = 0;
|
|
1704 cr->mate_pos = 0;
|
|
1705 cr->mate_ref_id = -1;
|
|
1706 }
|
|
1707
|
|
1708 return;
|
|
1709 }
|
|
1710
|
|
1711 for (rec = 0; rec < s->hdr->num_records; rec++) {
|
|
1712 cram_record *cr = &s->crecs[rec];
|
|
1713
|
|
1714 if (cr->mate_line >= 0) {
|
|
1715 if (cr->mate_line < s->hdr->num_records) {
|
|
1716 /*
|
|
1717 * On the first read, loop through computing lengths.
|
|
1718 * It's not perfect as we have one slice per reference so we
|
|
1719 * cannot detect when TLEN should be zero due to seqs that
|
|
1720 * map to multiple references.
|
|
1721 *
|
|
1722 * We also cannot set tlen correct when it spans a slice for
|
|
1723 * other reasons. This may make tlen too small. Should we
|
|
1724 * fix this by forcing TLEN to be stored verbatim in such cases?
|
|
1725 *
|
|
1726 * Or do we just admit defeat and output 0 for tlen? It's the
|
|
1727 * safe option...
|
|
1728 */
|
|
1729 if (cr->tlen == INT_MIN) {
|
|
1730 int id1 = rec, id2 = rec;
|
|
1731 int aleft = cr->apos, aright = cr->aend;
|
|
1732 int tlen;
|
|
1733 int ref = cr->ref_id;
|
|
1734
|
|
1735 // number of segments starting at the same point.
|
|
1736 int left_cnt = 0;
|
|
1737
|
|
1738 do {
|
|
1739 if (aleft > s->crecs[id2].apos)
|
|
1740 aleft = s->crecs[id2].apos, left_cnt = 1;
|
|
1741 else if (aleft == s->crecs[id2].apos)
|
|
1742 left_cnt++;
|
|
1743 if (aright < s->crecs[id2].aend)
|
|
1744 aright = s->crecs[id2].aend;
|
|
1745 if (s->crecs[id2].mate_line == -1) {
|
|
1746 s->crecs[id2].mate_line = rec;
|
|
1747 break;
|
|
1748 }
|
|
1749 assert(s->crecs[id2].mate_line > id2);
|
|
1750 id2 = s->crecs[id2].mate_line;
|
|
1751
|
|
1752 if (s->crecs[id2].ref_id != ref)
|
|
1753 ref = -1;
|
|
1754 } while (id2 != id1);
|
|
1755
|
|
1756 if (ref != -1) {
|
|
1757 tlen = aright - aleft + 1;
|
|
1758 id1 = id2 = rec;
|
|
1759
|
|
1760 /*
|
|
1761 * When we have two seqs with identical start and
|
|
1762 * end coordinates, set +/- tlen based on 1st/last
|
|
1763 * bit flags instead, as a tie breaker.
|
|
1764 */
|
|
1765 if (s->crecs[id2].apos == aleft) {
|
|
1766 if (left_cnt == 1 ||
|
|
1767 (s->crecs[id2].flags & BAM_FREAD1))
|
|
1768 s->crecs[id2].tlen = tlen;
|
|
1769 else
|
|
1770 s->crecs[id2].tlen = -tlen;
|
|
1771 } else {
|
|
1772 s->crecs[id2].tlen = -tlen;
|
|
1773 }
|
|
1774
|
|
1775 id2 = s->crecs[id2].mate_line;
|
|
1776 while (id2 != id1) {
|
|
1777 if (s->crecs[id2].apos == aleft) {
|
|
1778 if (left_cnt == 1 ||
|
|
1779 (s->crecs[id2].flags & BAM_FREAD1))
|
|
1780 s->crecs[id2].tlen = tlen;
|
|
1781 else
|
|
1782 s->crecs[id2].tlen = -tlen;
|
|
1783 } else {
|
|
1784 s->crecs[id2].tlen = -tlen;
|
|
1785 }
|
|
1786 id2 = s->crecs[id2].mate_line;
|
|
1787 }
|
|
1788 } else {
|
|
1789 id1 = id2 = rec;
|
|
1790
|
|
1791 s->crecs[id2].tlen = 0;
|
|
1792 id2 = s->crecs[id2].mate_line;
|
|
1793 while (id2 != id1) {
|
|
1794 s->crecs[id2].tlen = 0;
|
|
1795 id2 = s->crecs[id2].mate_line;
|
|
1796 }
|
|
1797 }
|
|
1798 }
|
|
1799
|
|
1800 cr->mate_pos = s->crecs[cr->mate_line].apos;
|
|
1801 cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
|
|
1802
|
|
1803 // paired
|
|
1804 cr->flags |= BAM_FPAIRED;
|
|
1805
|
|
1806 // set mate unmapped if needed
|
|
1807 if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
|
|
1808 cr->flags |= BAM_FMUNMAP;
|
|
1809 cr->tlen = 0;
|
|
1810 }
|
|
1811 if (cr->flags & BAM_FUNMAP) {
|
|
1812 cr->tlen = 0;
|
|
1813 }
|
|
1814
|
|
1815 // set mate reversed if needed
|
|
1816 if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
|
|
1817 cr->flags |= BAM_FMREVERSE;
|
|
1818 } else {
|
|
1819 fprintf(stderr, "Mate line out of bounds: %d vs [0, %d]\n",
|
|
1820 cr->mate_line, s->hdr->num_records-1);
|
|
1821 }
|
|
1822
|
|
1823 /* FIXME: construct read names here too if needed */
|
|
1824 } else {
|
|
1825 if (cr->mate_flags & CRAM_M_REVERSE) {
|
|
1826 cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
|
|
1827 }
|
|
1828 if (cr->mate_flags & CRAM_M_UNMAP) {
|
|
1829 cr->flags |= BAM_FMUNMAP;
|
|
1830 //cr->mate_ref_id = -1;
|
|
1831 }
|
|
1832 if (!(cr->flags & BAM_FPAIRED))
|
|
1833 cr->mate_ref_id = -1;
|
|
1834 }
|
|
1835
|
|
1836 if (cr->tlen == INT_MIN)
|
|
1837 cr->tlen = 0; // Just incase
|
|
1838 }
|
|
1839 }
|
|
1840
|
|
1841 static char *md5_print(unsigned char *md5, char *out) {
|
|
1842 int i;
|
|
1843 for (i = 0; i < 16; i++) {
|
|
1844 out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
|
|
1845 out[i*2+1] = "0123456789abcdef"[md5[i]&15];
|
|
1846 }
|
|
1847 out[32] = 0;
|
|
1848
|
|
1849 return out;
|
|
1850 }
|
|
1851
|
|
1852 /*
|
|
1853 * Decode an entire slice from container blocks. Fills out s->crecs[] array.
|
|
1854 * Returns 0 on success
|
|
1855 * -1 on failure
|
|
1856 */
|
|
1857 int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
|
|
1858 SAM_hdr *bfd) {
|
|
1859 cram_block *blk = s->block[0];
|
|
1860 int32_t bf, ref_id;
|
|
1861 unsigned char cf;
|
|
1862 int out_sz, r = 0;
|
|
1863 int rec;
|
|
1864 char *seq = NULL, *qual = NULL;
|
|
1865 int unknown_rg = -1;
|
|
1866 int embed_ref;
|
|
1867 char **refs = NULL;
|
|
1868 uint32_t ds;
|
|
1869
|
|
1870 if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
|
|
1871 return -1;
|
|
1872
|
|
1873 ds = c->comp_hdr->data_series;
|
|
1874
|
|
1875 blk->bit = 7; // MSB first
|
|
1876
|
|
1877 /* Look for unknown RG, added as last by Java CRAM? */
|
|
1878 if (bfd->nrg > 0 &&
|
|
1879 !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
|
|
1880 unknown_rg = bfd->nrg-1;
|
|
1881
|
|
1882 if (blk->content_type != CORE)
|
|
1883 return -1;
|
|
1884
|
|
1885 if (s->crecs)
|
|
1886 free(s->crecs);
|
|
1887 if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
|
|
1888 return -1;
|
|
1889
|
|
1890 ref_id = s->hdr->ref_seq_id;
|
|
1891 embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
|
|
1892
|
|
1893 if (ref_id >= 0) {
|
|
1894 if (embed_ref) {
|
|
1895 cram_block *b;
|
|
1896 if (s->hdr->ref_base_id < 0) {
|
|
1897 fprintf(stderr, "No reference specified and "
|
|
1898 "no embedded reference is available.\n");
|
|
1899 return -1;
|
|
1900 }
|
|
1901 if (!s->block_by_id ||
|
|
1902 !(b = s->block_by_id[s->hdr->ref_base_id]))
|
|
1903 return -1;
|
|
1904 cram_uncompress_block(b);
|
|
1905 s->ref = (char *)BLOCK_DATA(b);
|
|
1906 s->ref_start = s->hdr->ref_seq_start;
|
|
1907 s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
|
|
1908 } else if (!fd->no_ref) {
|
|
1909 //// Avoid Java cramtools bug by loading entire reference seq
|
|
1910 //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
|
|
1911 //s->ref_start = 1;
|
|
1912
|
|
1913 if (fd->required_fields & SAM_SEQ)
|
|
1914 s->ref =
|
|
1915 cram_get_ref(fd, s->hdr->ref_seq_id,
|
|
1916 s->hdr->ref_seq_start,
|
|
1917 s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
|
|
1918 s->ref_start = s->hdr->ref_seq_start;
|
|
1919 s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
|
|
1920
|
|
1921 /* Sanity check */
|
|
1922 if (s->ref_start < 0) {
|
|
1923 fprintf(stderr, "Slice starts before base 1.\n");
|
|
1924 s->ref_start = 0;
|
|
1925 }
|
|
1926 pthread_mutex_lock(&fd->ref_lock);
|
|
1927 pthread_mutex_lock(&fd->refs->lock);
|
|
1928 if ((fd->required_fields & SAM_SEQ) &&
|
|
1929 s->ref_end > fd->refs->ref_id[ref_id]->length) {
|
|
1930 fprintf(stderr, "Slice ends beyond reference end.\n");
|
|
1931 s->ref_end = fd->refs->ref_id[ref_id]->length;
|
|
1932 }
|
|
1933 pthread_mutex_unlock(&fd->refs->lock);
|
|
1934 pthread_mutex_unlock(&fd->ref_lock);
|
|
1935 }
|
|
1936 }
|
|
1937
|
|
1938 if ((fd->required_fields & SAM_SEQ) &&
|
|
1939 s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) {
|
|
1940 fprintf(stderr, "Unable to fetch reference #%d %d..%d\n",
|
|
1941 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
|
|
1942 s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
|
|
1943 return -1;
|
|
1944 }
|
|
1945
|
|
1946 if (CRAM_MAJOR_VERS(fd->version) != 1
|
|
1947 && (fd->required_fields & SAM_SEQ)
|
|
1948 && s->hdr->ref_seq_id >= 0
|
|
1949 && !fd->ignore_md5
|
|
1950 && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
|
|
1951 MD5_CTX md5;
|
|
1952 unsigned char digest[16];
|
|
1953
|
|
1954 if (s->ref && s->hdr->ref_seq_id >= 0) {
|
|
1955 int start, len;
|
|
1956
|
|
1957 if (s->hdr->ref_seq_start >= s->ref_start) {
|
|
1958 start = s->hdr->ref_seq_start - s->ref_start;
|
|
1959 } else {
|
|
1960 fprintf(stderr, "Slice starts before base 1.\n");
|
|
1961 start = 0;
|
|
1962 }
|
|
1963
|
|
1964 if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
|
|
1965 len = s->hdr->ref_seq_span;
|
|
1966 } else {
|
|
1967 fprintf(stderr, "Slice ends beyond reference end.\n");
|
|
1968 len = s->ref_end - s->ref_start + 1;
|
|
1969 }
|
|
1970
|
|
1971 MD5_Init(&md5);
|
|
1972 if (start + len > s->ref_end - s->ref_start + 1)
|
|
1973 len = s->ref_end - s->ref_start + 1 - start;
|
|
1974 if (len >= 0)
|
|
1975 MD5_Update(&md5, s->ref + start, len);
|
|
1976 MD5_Final(digest, &md5);
|
|
1977 } else if (!s->ref && s->hdr->ref_base_id >= 0) {
|
|
1978 cram_block *b;
|
|
1979 if (s->block_by_id && (b = s->block_by_id[s->hdr->ref_base_id])) {
|
|
1980 MD5_Init(&md5);
|
|
1981 MD5_Update(&md5, b->data, b->uncomp_size);
|
|
1982 MD5_Final(digest, &md5);
|
|
1983 }
|
|
1984 }
|
|
1985
|
|
1986 if ((!s->ref && s->hdr->ref_base_id < 0)
|
|
1987 || memcmp(digest, s->hdr->md5, 16) != 0) {
|
|
1988 char M[33];
|
|
1989 fprintf(stderr, "ERROR: md5sum reference mismatch for ref "
|
|
1990 "%d pos %d..%d\n", ref_id, s->ref_start, s->ref_end);
|
|
1991 fprintf(stderr, "CRAM: %s\n", md5_print(s->hdr->md5, M));
|
|
1992 fprintf(stderr, "Ref : %s\n", md5_print(digest, M));
|
|
1993 return -1;
|
|
1994 }
|
|
1995 }
|
|
1996
|
|
1997 if (ref_id == -2) {
|
|
1998 pthread_mutex_lock(&fd->ref_lock);
|
|
1999 pthread_mutex_lock(&fd->refs->lock);
|
|
2000 refs = calloc(fd->refs->nref, sizeof(char *));
|
|
2001 pthread_mutex_unlock(&fd->refs->lock);
|
|
2002 pthread_mutex_unlock(&fd->ref_lock);
|
|
2003 if (!refs)
|
|
2004 return -1;
|
|
2005 }
|
|
2006
|
|
2007 for (rec = 0; rec < s->hdr->num_records; rec++) {
|
|
2008 cram_record *cr = &s->crecs[rec];
|
|
2009
|
|
2010 //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
|
|
2011
|
|
2012 cr->s = s;
|
|
2013
|
|
2014 out_sz = 1; /* decode 1 item */
|
|
2015 if (ds & CRAM_BF) {
|
|
2016 if (!c->comp_hdr->codecs[DS_BF]) return -1;
|
|
2017 r |= c->comp_hdr->codecs[DS_BF]
|
|
2018 ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
|
|
2019 (char *)&bf, &out_sz);
|
|
2020 if (bf < 0 ||
|
|
2021 bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
|
|
2022 return -1;
|
|
2023 bf = fd->bam_flag_swap[bf];
|
|
2024 cr->flags = bf;
|
|
2025 } else {
|
|
2026 cr->flags = bf = 0x4; // unmapped
|
|
2027 }
|
|
2028
|
|
2029 if (ds & CRAM_CF) {
|
|
2030 if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
|
2031 /* CF is byte in 1.0, int32 in 2.0 */
|
|
2032 if (!c->comp_hdr->codecs[DS_CF]) return -1;
|
|
2033 r |= c->comp_hdr->codecs[DS_CF]
|
|
2034 ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
|
|
2035 (char *)&cf, &out_sz);
|
|
2036 cr->cram_flags = cf;
|
|
2037 } else {
|
|
2038 if (!c->comp_hdr->codecs[DS_CF]) return -1;
|
|
2039 r |= c->comp_hdr->codecs[DS_CF]
|
|
2040 ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
|
|
2041 (char *)&cr->cram_flags,
|
|
2042 &out_sz);
|
|
2043 cf = cr->cram_flags;
|
|
2044 }
|
|
2045 }
|
|
2046
|
|
2047 if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
|
|
2048 if (ds & CRAM_RI) {
|
|
2049 if (!c->comp_hdr->codecs[DS_RI]) return -1;
|
|
2050 r |= c->comp_hdr->codecs[DS_RI]
|
|
2051 ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
|
|
2052 (char *)&cr->ref_id, &out_sz);
|
|
2053 if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
|
|
2054 && cr->ref_id >= 0) {
|
|
2055 if (!fd->no_ref) {
|
|
2056 if (!refs[cr->ref_id])
|
|
2057 refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id,
|
|
2058 1, 0);
|
|
2059 s->ref = refs[cr->ref_id];
|
|
2060 }
|
|
2061 s->ref_start = 1;
|
|
2062 pthread_mutex_lock(&fd->ref_lock);
|
|
2063 pthread_mutex_lock(&fd->refs->lock);
|
|
2064 s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
|
|
2065 pthread_mutex_unlock(&fd->refs->lock);
|
|
2066 pthread_mutex_unlock(&fd->ref_lock);
|
|
2067 }
|
|
2068 } else {
|
|
2069 cr->ref_id = 0;
|
|
2070 }
|
|
2071 } else {
|
|
2072 cr->ref_id = ref_id; // Forced constant in CRAM 1.0
|
|
2073 }
|
|
2074
|
|
2075
|
|
2076 if (ds & CRAM_RL) {
|
|
2077 if (!c->comp_hdr->codecs[DS_RL]) return -1;
|
|
2078 r |= c->comp_hdr->codecs[DS_RL]
|
|
2079 ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
|
|
2080 (char *)&cr->len, &out_sz);
|
|
2081 }
|
|
2082
|
|
2083 if (ds & CRAM_AP) {
|
|
2084 if (!c->comp_hdr->codecs[DS_AP]) return -1;
|
|
2085 r |= c->comp_hdr->codecs[DS_AP]
|
|
2086 ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
|
|
2087 (char *)&cr->apos, &out_sz);
|
|
2088 if (c->comp_hdr->AP_delta)
|
|
2089 cr->apos += s->last_apos;
|
|
2090 s->last_apos= cr->apos;
|
|
2091 } else {
|
|
2092 cr->apos = c->ref_seq_start;
|
|
2093 }
|
|
2094
|
|
2095 if (ds & CRAM_RG) {
|
|
2096 if (!c->comp_hdr->codecs[DS_RG]) return -1;
|
|
2097 r |= c->comp_hdr->codecs[DS_RG]
|
|
2098 ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
|
|
2099 (char *)&cr->rg, &out_sz);
|
|
2100 if (cr->rg == unknown_rg)
|
|
2101 cr->rg = -1;
|
|
2102 } else {
|
|
2103 cr->rg = -1;
|
|
2104 }
|
|
2105
|
|
2106 cr->name_len = 0;
|
|
2107
|
|
2108 if (c->comp_hdr->read_names_included) {
|
|
2109 int32_t out_sz2 = 1;
|
|
2110
|
|
2111 // Read directly into name cram_block
|
|
2112 cr->name = BLOCK_SIZE(s->name_blk);
|
|
2113 if (ds & CRAM_RN) {
|
|
2114 if (!c->comp_hdr->codecs[DS_RN]) return -1;
|
|
2115 r |= c->comp_hdr->codecs[DS_RN]
|
|
2116 ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
|
|
2117 (char *)s->name_blk, &out_sz2);
|
|
2118 cr->name_len = out_sz2;
|
|
2119 }
|
|
2120 }
|
|
2121
|
|
2122 cr->mate_pos = 0;
|
|
2123 cr->mate_line = -1;
|
|
2124 cr->mate_ref_id = -1;
|
|
2125 if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
|
|
2126 if (ds & CRAM_MF) {
|
|
2127 if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
|
2128 /* MF is byte in 1.0, int32 in 2.0 */
|
|
2129 unsigned char mf;
|
|
2130 if (!c->comp_hdr->codecs[DS_MF]) return -1;
|
|
2131 r |= c->comp_hdr->codecs[DS_MF]
|
|
2132 ->decode(s, c->comp_hdr->codecs[DS_MF],
|
|
2133 blk, (char *)&mf, &out_sz);
|
|
2134 cr->mate_flags = mf;
|
|
2135 } else {
|
|
2136 if (!c->comp_hdr->codecs[DS_MF]) return -1;
|
|
2137 r |= c->comp_hdr->codecs[DS_MF]
|
|
2138 ->decode(s, c->comp_hdr->codecs[DS_MF],
|
|
2139 blk,
|
|
2140 (char *)&cr->mate_flags,
|
|
2141 &out_sz);
|
|
2142 }
|
|
2143 } else {
|
|
2144 cr->mate_flags = 0;
|
|
2145 }
|
|
2146
|
|
2147 if (!c->comp_hdr->read_names_included) {
|
|
2148 int32_t out_sz2 = 1;
|
|
2149
|
|
2150 // Read directly into name cram_block
|
|
2151 cr->name = BLOCK_SIZE(s->name_blk);
|
|
2152 if (ds & CRAM_RN) {
|
|
2153 if (!c->comp_hdr->codecs[DS_RN]) return -1;
|
|
2154 r |= c->comp_hdr->codecs[DS_RN]
|
|
2155 ->decode(s, c->comp_hdr->codecs[DS_RN],
|
|
2156 blk, (char *)s->name_blk,
|
|
2157 &out_sz2);
|
|
2158 cr->name_len = out_sz2;
|
|
2159 }
|
|
2160 }
|
|
2161
|
|
2162 if (ds & CRAM_NS) {
|
|
2163 if (!c->comp_hdr->codecs[DS_NS]) return -1;
|
|
2164 r |= c->comp_hdr->codecs[DS_NS]
|
|
2165 ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
|
|
2166 (char *)&cr->mate_ref_id, &out_sz);
|
|
2167 }
|
|
2168
|
|
2169 // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
|
|
2170 // if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
|
|
2171 // /* Paired, but unmapped */
|
|
2172 // cr->flags |= BAM_FMUNMAP;
|
|
2173 // }
|
|
2174
|
|
2175 if (ds & CRAM_NP) {
|
|
2176 if (!c->comp_hdr->codecs[DS_NP]) return -1;
|
|
2177 r |= c->comp_hdr->codecs[DS_NP]
|
|
2178 ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
|
|
2179 (char *)&cr->mate_pos, &out_sz);
|
|
2180 }
|
|
2181
|
|
2182 if (ds & CRAM_TS) {
|
|
2183 if (!c->comp_hdr->codecs[DS_TS]) return -1;
|
|
2184 r |= c->comp_hdr->codecs[DS_TS]
|
|
2185 ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
|
|
2186 (char *)&cr->tlen, &out_sz);
|
|
2187 } else {
|
|
2188 cr->tlen = INT_MIN;
|
|
2189 }
|
|
2190 } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
|
|
2191 if (ds & CRAM_NF) {
|
|
2192 if (!c->comp_hdr->codecs[DS_NF]) return -1;
|
|
2193 r |= c->comp_hdr->codecs[DS_NF]
|
|
2194 ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
|
|
2195 (char *)&cr->mate_line, &out_sz);
|
|
2196 cr->mate_line += rec + 1;
|
|
2197
|
|
2198 //cr->name_len = sprintf(name, "%d", name_id++);
|
|
2199 //cr->name = DSTRING_LEN(name_ds);
|
|
2200 //dstring_nappend(name_ds, name, cr->name_len);
|
|
2201
|
|
2202 cr->mate_ref_id = -1;
|
|
2203 cr->tlen = INT_MIN;
|
|
2204 cr->mate_pos = 0;
|
|
2205 } else {
|
|
2206 cr->mate_flags = 0;
|
|
2207 cr->tlen = INT_MIN;
|
|
2208 }
|
|
2209 } else {
|
|
2210 cr->mate_flags = 0;
|
|
2211 cr->tlen = INT_MIN;
|
|
2212 }
|
|
2213 /*
|
|
2214 else if (!name[0]) {
|
|
2215 //name[0] = '?'; name[1] = 0;
|
|
2216 //cr->name_len = 1;
|
|
2217 //cr->name= DSTRING_LEN(s->name_ds);
|
|
2218 //dstring_nappend(s->name_ds, "?", 1);
|
|
2219
|
|
2220 cr->mate_ref_id = -1;
|
|
2221 cr->tlen = 0;
|
|
2222 cr->mate_pos = 0;
|
|
2223 }
|
|
2224 */
|
|
2225
|
|
2226 /* Auxiliary tags */
|
|
2227 if (CRAM_MAJOR_VERS(fd->version) == 1)
|
|
2228 r |= cram_decode_aux_1_0(c, s, blk, cr);
|
|
2229 else
|
|
2230 r |= cram_decode_aux(c, s, blk, cr);
|
|
2231
|
|
2232 /* Fake up dynamic string growth and appending */
|
|
2233 if (ds & CRAM_RL) {
|
|
2234 cr->seq = BLOCK_SIZE(s->seqs_blk);
|
|
2235 BLOCK_GROW(s->seqs_blk, cr->len);
|
|
2236 seq = (char *)BLOCK_END(s->seqs_blk);
|
|
2237 BLOCK_SIZE(s->seqs_blk) += cr->len;
|
|
2238
|
|
2239 if (!seq)
|
|
2240 return -1;
|
|
2241
|
|
2242 cr->qual = BLOCK_SIZE(s->qual_blk);
|
|
2243 BLOCK_GROW(s->qual_blk, cr->len);
|
|
2244 qual = (char *)BLOCK_END(s->qual_blk);
|
|
2245 BLOCK_SIZE(s->qual_blk) += cr->len;
|
|
2246
|
|
2247 if (!s->ref)
|
|
2248 memset(seq, '=', cr->len);
|
|
2249 }
|
|
2250
|
|
2251 if (!(bf & BAM_FUNMAP)) {
|
|
2252 /* Decode sequence and generate CIGAR */
|
|
2253 if (ds & (CRAM_SEQ | CRAM_MQ)) {
|
|
2254 r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual);
|
|
2255 } else {
|
|
2256 cr->cigar = 0;
|
|
2257 cr->ncigar = 0;
|
|
2258 cr->aend = cr->apos;
|
|
2259 cr->mqual = 0;
|
|
2260 }
|
|
2261 } else {
|
|
2262 int out_sz2 = cr->len;
|
|
2263
|
|
2264 //puts("Unmapped");
|
|
2265 cr->cigar = 0;
|
|
2266 cr->ncigar = 0;
|
|
2267 cr->aend = cr->apos;
|
|
2268 cr->mqual = 0;
|
|
2269
|
|
2270 if (ds & CRAM_BA) {
|
|
2271 if (!c->comp_hdr->codecs[DS_BA]) return -1;
|
|
2272 r |= c->comp_hdr->codecs[DS_BA]
|
|
2273 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
|
|
2274 (char *)seq, &out_sz2);
|
|
2275 }
|
|
2276
|
|
2277 if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
|
|
2278 out_sz2 = cr->len;
|
|
2279 if (ds & CRAM_QS) {
|
|
2280 if (!c->comp_hdr->codecs[DS_QS]) return -1;
|
|
2281 r |= c->comp_hdr->codecs[DS_QS]
|
|
2282 ->decode(s, c->comp_hdr->codecs[DS_QS],
|
|
2283 blk, qual, &out_sz2);
|
|
2284 }
|
|
2285 } else {
|
|
2286 if (ds & CRAM_RL)
|
|
2287 memset(qual, 30, cr->len);
|
|
2288 }
|
|
2289 }
|
|
2290 }
|
|
2291
|
|
2292 pthread_mutex_lock(&fd->ref_lock);
|
|
2293 if (refs) {
|
|
2294 int i;
|
|
2295 for (i = 0; i < fd->refs->nref; i++) {
|
|
2296 if (refs[i])
|
|
2297 cram_ref_decr(fd->refs, i);
|
|
2298 }
|
|
2299 free(refs);
|
|
2300 } else if (ref_id >= 0 && s->ref != fd->ref_free) {
|
|
2301 cram_ref_decr(fd->refs, ref_id);
|
|
2302 }
|
|
2303 pthread_mutex_unlock(&fd->ref_lock);
|
|
2304
|
|
2305 /* Resolve mate pair cross-references between recs within this slice */
|
|
2306 cram_decode_slice_xref(s, fd->required_fields);
|
|
2307
|
|
2308 return r;
|
|
2309 }
|
|
2310
|
|
2311 typedef struct {
|
|
2312 cram_fd *fd;
|
|
2313 cram_container *c;
|
|
2314 cram_slice *s;
|
|
2315 SAM_hdr *h;
|
|
2316 int exit_code;
|
|
2317 } cram_decode_job;
|
|
2318
|
|
2319 void *cram_decode_slice_thread(void *arg) {
|
|
2320 cram_decode_job *j = (cram_decode_job *)arg;
|
|
2321
|
|
2322 j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
|
|
2323
|
|
2324 return j;
|
|
2325 }
|
|
2326
|
|
2327 /*
|
|
2328 * Spawn a multi-threaded version of cram_decode_slice().
|
|
2329 */
|
|
2330 int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
|
|
2331 SAM_hdr *bfd) {
|
|
2332 cram_decode_job *j;
|
|
2333 int nonblock;
|
|
2334
|
|
2335 if (!fd->pool)
|
|
2336 return cram_decode_slice(fd, c, s, bfd);
|
|
2337
|
|
2338 if (!(j = malloc(sizeof(*j))))
|
|
2339 return -1;
|
|
2340
|
|
2341 j->fd = fd;
|
|
2342 j->c = c;
|
|
2343 j->s = s;
|
|
2344 j->h = bfd;
|
|
2345
|
|
2346 nonblock = t_pool_results_queue_sz(fd->rqueue) ? 1 : 0;
|
|
2347
|
|
2348 if (-1 == t_pool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
|
|
2349 j, nonblock)) {
|
|
2350 /* Would block */
|
|
2351 fd->job_pending = j;
|
|
2352 } else {
|
|
2353 fd->job_pending = NULL;
|
|
2354 }
|
|
2355
|
|
2356 // flush too
|
|
2357 return 0;
|
|
2358 }
|
|
2359
|
|
2360
|
|
2361 /* ----------------------------------------------------------------------
|
|
2362 * CRAM sequence iterators.
|
|
2363 */
|
|
2364
|
|
2365 /*
|
|
2366 * Converts a cram in-memory record into a bam in-memory record. We
|
|
2367 * pass a pointer to a bam_seq_t pointer along with the a pointer to
|
|
2368 * the allocated size. These can initially be pointers to NULL and zero.
|
|
2369 *
|
|
2370 * This function will reallocate the bam buffer as required and update
|
|
2371 * (*bam)->alloc accordingly, allowing it to be used within a loop
|
|
2372 * efficiently without needing to allocate new bam objects over and
|
|
2373 * over again.
|
|
2374 *
|
|
2375 * Returns the used size of the bam record on success
|
|
2376 * -1 on failure.
|
|
2377 */
|
|
2378 static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
|
|
2379 cram_record *cr, int rec, bam_seq_t **bam) {
|
|
2380 int bam_idx, rg_len;
|
|
2381 char name_a[1024], *name;
|
|
2382 int name_len;
|
|
2383 char *aux, *aux_orig;
|
|
2384 char *seq, *qual;
|
|
2385
|
|
2386 /* Assign names if not explicitly set */
|
|
2387 if (fd->required_fields & SAM_QNAME) {
|
|
2388 if (cr->name_len) {
|
|
2389 name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
|
|
2390 name_len = cr->name_len;
|
|
2391 } else {
|
|
2392 name = name_a;
|
|
2393 name_len = strlen(fd->prefix);
|
|
2394 memcpy(name, fd->prefix, name_len);
|
|
2395 name += name_len;
|
|
2396 *name++ = ':';
|
|
2397 if (cr->mate_line >= 0 && cr->mate_line < rec)
|
|
2398 name = (char *)append_uint64((unsigned char *)name,
|
|
2399 s->hdr->record_counter +
|
|
2400 cr->mate_line + 1);
|
|
2401 else
|
|
2402 name = (char *)append_uint64((unsigned char *)name,
|
|
2403 s->hdr->record_counter +
|
|
2404 rec + 1);
|
|
2405 name_len = name - name_a;
|
|
2406 name = name_a;
|
|
2407 }
|
|
2408 } else {
|
|
2409 name = "?";
|
|
2410 name_len = 1;
|
|
2411 }
|
|
2412
|
|
2413 /* Generate BAM record */
|
|
2414 if (cr->rg < -1 || cr->rg >= bfd->nrg)
|
|
2415 return -1;
|
|
2416 rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
|
|
2417
|
|
2418 if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
|
|
2419 if (!BLOCK_DATA(s->seqs_blk))
|
|
2420 return -1;
|
|
2421 seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
|
|
2422 } else {
|
|
2423 seq = "*";
|
|
2424 cr->len = 1;
|
|
2425 }
|
|
2426
|
|
2427
|
|
2428 if (fd->required_fields & SAM_QUAL) {
|
|
2429 if (!BLOCK_DATA(s->qual_blk))
|
|
2430 return -1;
|
|
2431 qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
|
|
2432 } else {
|
|
2433 qual = NULL;
|
|
2434 }
|
|
2435
|
|
2436 bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len,
|
|
2437 name, name_len,
|
|
2438 cr->flags,
|
|
2439 cr->ref_id,
|
|
2440 cr->apos,
|
|
2441 cr->aend,
|
|
2442 cr->mqual,
|
|
2443 cr->ncigar, &s->cigar[cr->cigar],
|
|
2444 cr->mate_ref_id,
|
|
2445 cr->mate_pos,
|
|
2446 cr->tlen,
|
|
2447 cr->len,
|
|
2448 seq,
|
|
2449 qual);
|
|
2450 if (bam_idx == -1)
|
|
2451 return -1;
|
|
2452
|
|
2453 aux = aux_orig = (char *)bam_aux(*bam);
|
|
2454
|
|
2455 /* Auxiliary strings */
|
|
2456 if (cr->aux_size != 0) {
|
|
2457 memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
|
|
2458 aux += cr->aux_size;
|
|
2459 }
|
|
2460
|
|
2461 /* RG:Z: */
|
|
2462 if (cr->rg != -1) {
|
|
2463 int len = bfd->rg[cr->rg].name_len;
|
|
2464 *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
|
|
2465 memcpy(aux, bfd->rg[cr->rg].name, len);
|
|
2466 aux += len;
|
|
2467 *aux++ = 0;
|
|
2468 }
|
|
2469
|
|
2470 return bam_idx + (aux - aux_orig);
|
|
2471 }
|
|
2472
|
|
2473 /*
|
|
2474 * Here be dragons! The multi-threading code in this is crufty beyond belief.
|
|
2475 */
|
|
2476 static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
|
|
2477 cram_container *c;
|
|
2478 cram_slice *s = NULL;
|
|
2479
|
|
2480 if (!(c = fd->ctr)) {
|
|
2481 // Load first container.
|
|
2482 do {
|
|
2483 if (!(c = fd->ctr = cram_read_container(fd)))
|
|
2484 return NULL;
|
|
2485 } while (c->length == 0);
|
|
2486
|
|
2487 /*
|
|
2488 * The first container may be a result of a sub-range query.
|
|
2489 * In which case it may still not be the optimal starting point
|
|
2490 * due to skipped containers/slices in the index.
|
|
2491 */
|
|
2492 if (fd->range.refid != -2) {
|
|
2493 while (c->ref_seq_id != -2 &&
|
|
2494 (c->ref_seq_id < fd->range.refid ||
|
|
2495 c->ref_seq_start + c->ref_seq_span-1 < fd->range.start)) {
|
|
2496 if (0 != cram_seek(fd, c->length, SEEK_CUR))
|
|
2497 return NULL;
|
|
2498 cram_free_container(fd->ctr);
|
|
2499 do {
|
|
2500 if (!(c = fd->ctr = cram_read_container(fd)))
|
|
2501 return NULL;
|
|
2502 } while (c->length == 0);
|
|
2503 }
|
|
2504
|
|
2505 if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid)
|
|
2506 return NULL;
|
|
2507 }
|
|
2508
|
|
2509 if (!(c->comp_hdr_block = cram_read_block(fd)))
|
|
2510 return NULL;
|
|
2511 if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
|
|
2512 return NULL;
|
|
2513
|
|
2514 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
|
|
2515 if (!c->comp_hdr)
|
|
2516 return NULL;
|
|
2517 if (!c->comp_hdr->AP_delta) {
|
|
2518 pthread_mutex_lock(&fd->ref_lock);
|
|
2519 fd->unsorted = 1;
|
|
2520 pthread_mutex_unlock(&fd->ref_lock);
|
|
2521 }
|
|
2522 }
|
|
2523
|
|
2524 if ((s = c->slice)) {
|
|
2525 c->slice = NULL;
|
|
2526 cram_free_slice(s);
|
|
2527 s = NULL;
|
|
2528 }
|
|
2529
|
|
2530 if (c->curr_slice == c->max_slice) {
|
|
2531 cram_free_container(c);
|
|
2532 c = NULL;
|
|
2533 }
|
|
2534
|
|
2535 /* Sorry this is so contorted! */
|
|
2536 for (;;) {
|
|
2537 if (fd->job_pending) {
|
|
2538 cram_decode_job *j = (cram_decode_job *)fd->job_pending;
|
|
2539 c = j->c;
|
|
2540 s = j->s;
|
|
2541 free(fd->job_pending);
|
|
2542 fd->job_pending = NULL;
|
|
2543 } else if (!fd->ooc) {
|
|
2544 empty_container:
|
|
2545 if (!c || c->curr_slice == c->max_slice) {
|
|
2546 // new container
|
|
2547 do {
|
|
2548 if (!(c = fd->ctr = cram_read_container(fd))) {
|
|
2549 if (fd->pool) {
|
|
2550 fd->ooc = 1;
|
|
2551 break;
|
|
2552 }
|
|
2553
|
|
2554 return NULL;
|
|
2555 }
|
|
2556 } while (c->length == 0);
|
|
2557 if (fd->ooc)
|
|
2558 break;
|
|
2559
|
|
2560 /* Skip containers not yet spanning our range */
|
|
2561 if (fd->range.refid != -2 && c->ref_seq_id != -2) {
|
|
2562 fd->required_fields |= SAM_POS;
|
|
2563
|
|
2564 if (c->ref_seq_id != fd->range.refid) {
|
|
2565 cram_free_container(c);
|
|
2566 fd->ctr = NULL;
|
|
2567 fd->ooc = 1;
|
|
2568 fd->eof = 1;
|
|
2569 break;
|
|
2570 }
|
|
2571
|
|
2572 if (c->ref_seq_start > fd->range.end) {
|
|
2573 cram_free_container(c);
|
|
2574 fd->ctr = NULL;
|
|
2575 fd->ooc = 1;
|
|
2576 fd->eof = 1;
|
|
2577 break;
|
|
2578 }
|
|
2579
|
|
2580 if (c->ref_seq_start + c->ref_seq_span-1 <
|
|
2581 fd->range.start) {
|
|
2582 c->curr_rec = c->max_rec;
|
|
2583 c->curr_slice = c->max_slice;
|
|
2584 cram_seek(fd, c->length, SEEK_CUR);
|
|
2585 cram_free_container(c);
|
|
2586 c = NULL;
|
|
2587 continue;
|
|
2588 }
|
|
2589 }
|
|
2590
|
|
2591 if (!(c->comp_hdr_block = cram_read_block(fd)))
|
|
2592 return NULL;
|
|
2593 if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
|
|
2594 return NULL;
|
|
2595
|
|
2596 c->comp_hdr =
|
|
2597 cram_decode_compression_header(fd, c->comp_hdr_block);
|
|
2598 if (!c->comp_hdr)
|
|
2599 return NULL;
|
|
2600
|
|
2601 if (!c->comp_hdr->AP_delta) {
|
|
2602 pthread_mutex_lock(&fd->ref_lock);
|
|
2603 fd->unsorted = 1;
|
|
2604 pthread_mutex_unlock(&fd->ref_lock);
|
|
2605 }
|
|
2606 }
|
|
2607
|
|
2608 if (c->num_records == 0) {
|
|
2609 cram_free_container(c); c = NULL;
|
|
2610 goto empty_container;
|
|
2611 }
|
|
2612
|
|
2613
|
|
2614 if (!(s = c->slice = cram_read_slice(fd)))
|
|
2615 return NULL;
|
|
2616 c->curr_slice++;
|
|
2617 c->curr_rec = 0;
|
|
2618 c->max_rec = s->hdr->num_records;
|
|
2619
|
|
2620 s->last_apos = s->hdr->ref_seq_start;
|
|
2621
|
|
2622 /* Skip slices not yet spanning our range */
|
|
2623 if (fd->range.refid != -2 && s->hdr->ref_seq_id != -2) {
|
|
2624 if (s->hdr->ref_seq_id != fd->range.refid) {
|
|
2625 fd->eof = 1;
|
|
2626 cram_free_slice(s);
|
|
2627 c->slice = NULL;
|
|
2628 return NULL;
|
|
2629 }
|
|
2630
|
|
2631 if (s->hdr->ref_seq_start > fd->range.end) {
|
|
2632 fd->eof = 1;
|
|
2633 cram_free_slice(s);
|
|
2634 c->slice = NULL;
|
|
2635 return NULL;
|
|
2636 }
|
|
2637
|
|
2638 if (s->hdr->ref_seq_start + s->hdr->ref_seq_span-1 <
|
|
2639 fd->range.start) {
|
|
2640 cram_free_slice(s);
|
|
2641 c->slice = NULL;
|
|
2642 cram_free_container(c);
|
|
2643 c = NULL;
|
|
2644 continue;
|
|
2645 }
|
|
2646 }
|
|
2647 }
|
|
2648
|
|
2649 /* Test decoding of 1st seq */
|
|
2650 if (!c || !s)
|
|
2651 break;
|
|
2652
|
|
2653 if (cram_decode_slice_mt(fd, c, s, fd->header) != 0) {
|
|
2654 // if (cram_decode_slice(fd, c, s, fd->header) != 0) {
|
|
2655 fprintf(stderr, "Failure to decode slice\n");
|
|
2656 cram_free_slice(s);
|
|
2657 c->slice = NULL;
|
|
2658 return NULL;
|
|
2659 }
|
|
2660
|
|
2661 if (!fd->pool || fd->job_pending)
|
|
2662 break;
|
|
2663
|
|
2664 // Push it a bit far, to qsize in queue rather than pending arrival,
|
|
2665 // as cram tends to be a bit bursty in decode timings.
|
|
2666 if (t_pool_results_queue_len(fd->rqueue) > fd->pool->qsize)
|
|
2667 break;
|
|
2668 }
|
|
2669
|
|
2670 if (fd->pool) {
|
|
2671 t_pool_result *res;
|
|
2672 cram_decode_job *j;
|
|
2673
|
|
2674 // fprintf(stderr, "Thread pool len = %d, %d\n",
|
|
2675 // t_pool_results_queue_len(fd->rqueue),
|
|
2676 // t_pool_results_queue_sz(fd->rqueue));
|
|
2677
|
|
2678 if (fd->ooc && t_pool_results_queue_empty(fd->rqueue))
|
|
2679 return NULL;
|
|
2680
|
|
2681 res = t_pool_next_result_wait(fd->rqueue);
|
|
2682
|
|
2683 if (!res || !res->data) {
|
|
2684 fprintf(stderr, "t_pool_next_result failure\n");
|
|
2685 return NULL;
|
|
2686 }
|
|
2687
|
|
2688 j = (cram_decode_job *)res->data;
|
|
2689 c = j->c;
|
|
2690 s = j->s;
|
|
2691
|
|
2692 fd->ctr = c;
|
|
2693
|
|
2694 t_pool_delete_result(res, 1);
|
|
2695 }
|
|
2696
|
|
2697 *cp = c;
|
|
2698 return s;
|
|
2699 }
|
|
2700
|
|
2701 /*
|
|
2702 * Read the next cram record and return it.
|
|
2703 * Note that to decode cram_record the caller will need to look up some data
|
|
2704 * in the current slice, pointed to by fd->ctr->slice. This is valid until
|
|
2705 * the next call to cram_get_seq (which may invalidate it).
|
|
2706 *
|
|
2707 * Returns record pointer on success (do not free)
|
|
2708 * NULL on failure
|
|
2709 */
|
|
2710 cram_record *cram_get_seq(cram_fd *fd) {
|
|
2711 cram_container *c;
|
|
2712 cram_slice *s;
|
|
2713
|
|
2714 for (;;) {
|
|
2715 c = fd->ctr;
|
|
2716 if (c && c->slice && c->curr_rec < c->max_rec) {
|
|
2717 s = c->slice;
|
|
2718 } else {
|
|
2719 if (!(s = cram_next_slice(fd, &c)))
|
|
2720 return NULL;
|
|
2721 }
|
|
2722
|
|
2723 if (fd->range.refid != -2) {
|
|
2724 if (s->crecs[c->curr_rec].ref_id < fd->range.refid) {
|
|
2725 c->curr_rec++;
|
|
2726 continue;
|
|
2727 }
|
|
2728
|
|
2729 if (s->crecs[c->curr_rec].ref_id != fd->range.refid) {
|
|
2730 fd->eof = 1;
|
|
2731 cram_free_slice(s);
|
|
2732 c->slice = NULL;
|
|
2733 return NULL;
|
|
2734 }
|
|
2735
|
|
2736 if (s->crecs[c->curr_rec].apos > fd->range.end) {
|
|
2737 fd->eof = 1;
|
|
2738 cram_free_slice(s);
|
|
2739 c->slice = NULL;
|
|
2740 return NULL;
|
|
2741 }
|
|
2742
|
|
2743 if (s->crecs[c->curr_rec].aend < fd->range.start) {
|
|
2744 c->curr_rec++;
|
|
2745 continue;
|
|
2746 }
|
|
2747 }
|
|
2748
|
|
2749 break;
|
|
2750 }
|
|
2751
|
|
2752 fd->ctr = c;
|
|
2753 c->slice = s;
|
|
2754 return &s->crecs[c->curr_rec++];
|
|
2755 }
|
|
2756
|
|
2757 /*
|
|
2758 * Read the next cram record and convert it to a bam_seq_t struct.
|
|
2759 *
|
|
2760 * Returns 0 on success
|
|
2761 * -1 on EOF or failure (check fd->err)
|
|
2762 */
|
|
2763 int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
|
|
2764 cram_record *cr;
|
|
2765 cram_container *c;
|
|
2766 cram_slice *s;
|
|
2767
|
|
2768 if (!(cr = cram_get_seq(fd)))
|
|
2769 return -1;
|
|
2770
|
|
2771 c = fd->ctr;
|
|
2772 s = c->slice;
|
|
2773
|
|
2774 return cram_to_bam(fd->header, fd, s, cr, c->curr_rec-1, bam);
|
|
2775 }
|