Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/cram_decode.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* | |
2 Copyright (c) 2012-2014 Genome Research Ltd. | |
3 Author: James Bonfield <jkb@sanger.ac.uk> | |
4 | |
5 Redistribution and use in source and binary forms, with or without | |
6 modification, are permitted provided that the following conditions are met: | |
7 | |
8 1. Redistributions of source code must retain the above copyright notice, | |
9 this list of conditions and the following disclaimer. | |
10 | |
11 2. Redistributions in binary form must reproduce the above copyright notice, | |
12 this list of conditions and the following disclaimer in the documentation | |
13 and/or other materials provided with the distribution. | |
14 | |
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
16 Institute nor the names of its contributors may be used to endorse or promote | |
17 products derived from this software without specific prior written permission. | |
18 | |
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 /* | |
32 * - In-memory decoding of CRAM data structures. | |
33 * - Iterator for reading CRAM record by record. | |
34 */ | |
35 | |
36 #ifdef HAVE_CONFIG_H | |
37 #include "io_lib_config.h" | |
38 #endif | |
39 | |
40 #include <stdio.h> | |
41 #include <errno.h> | |
42 #include <assert.h> | |
43 #include <stdlib.h> | |
44 #include <string.h> | |
45 #include <zlib.h> | |
46 #include <sys/types.h> | |
47 #include <sys/stat.h> | |
48 #include <math.h> | |
49 #include <ctype.h> | |
50 | |
51 #include "cram/cram.h" | |
52 #include "cram/os.h" | |
53 #include "cram/md5.h" | |
54 | |
55 //Whether CIGAR has just M or uses = and X to indicate match and mismatch | |
56 //#define USE_X | |
57 | |
58 /* ---------------------------------------------------------------------- | |
59 * CRAM compression headers | |
60 */ | |
61 | |
62 /* | |
63 * Decodes the Tag Dictionary record in the preservation map | |
64 * Updates the cram compression header. | |
65 * | |
66 * Returns number of bytes decoded on success | |
67 * -1 on failure | |
68 */ | |
69 int cram_decode_TD(char *cp, cram_block_compression_hdr *h) { | |
70 char *op = cp; | |
71 unsigned char *dat; | |
72 cram_block *b; | |
73 int32_t blk_size; | |
74 int nTL, i, sz; | |
75 | |
76 if (!(b = cram_new_block(0, 0))) | |
77 return -1; | |
78 h->TD_blk = b; | |
79 | |
80 /* Decode */ | |
81 cp += itf8_get(cp, &blk_size); | |
82 if (!blk_size) { | |
83 h->nTL = 0; | |
84 h->TL = NULL; | |
85 cram_free_block(b); | |
86 return cp - op; | |
87 } | |
88 | |
89 BLOCK_APPEND(b, cp, blk_size); | |
90 cp += blk_size; | |
91 sz = cp - op; | |
92 | |
93 // Force nul termination if missing | |
94 if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1]) | |
95 BLOCK_APPEND_CHAR(b, '\0'); | |
96 | |
97 /* Set up TL lookup table */ | |
98 dat = BLOCK_DATA(b); | |
99 | |
100 // Count | |
101 for (nTL = i = 0; i < BLOCK_SIZE(b); i++) { | |
102 nTL++; | |
103 while (dat[i]) | |
104 i++; | |
105 } | |
106 | |
107 // Copy | |
108 h->nTL = nTL; | |
109 if (!(h->TL = calloc(h->nTL, sizeof(unsigned char *)))) | |
110 return -1; | |
111 for (nTL = i = 0; i < BLOCK_SIZE(b); i++) { | |
112 h->TL[nTL++] = &dat[i]; | |
113 while (dat[i]) | |
114 i++; | |
115 } | |
116 | |
117 return sz; | |
118 } | |
119 | |
120 /* | |
121 * Decodes a CRAM block compression header. | |
122 * Returns header ptr on success | |
123 * NULL on failure | |
124 */ | |
125 cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, | |
126 cram_block *b) { | |
127 char *cp, *cp_copy; | |
128 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr)); | |
129 int i; | |
130 int32_t map_size, map_count; | |
131 | |
132 if (!hdr) | |
133 return NULL; | |
134 | |
135 if (b->method != RAW) { | |
136 if (cram_uncompress_block(b)) { | |
137 free(hdr); | |
138 return NULL; | |
139 } | |
140 } | |
141 | |
142 cp = (char *)b->data; | |
143 | |
144 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
145 cp += itf8_get(cp, &hdr->ref_seq_id); | |
146 cp += itf8_get(cp, &hdr->ref_seq_start); | |
147 cp += itf8_get(cp, &hdr->ref_seq_span); | |
148 cp += itf8_get(cp, &hdr->num_records); | |
149 cp += itf8_get(cp, &hdr->num_landmarks); | |
150 if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) { | |
151 free(hdr); | |
152 return NULL; | |
153 } | |
154 for (i = 0; i < hdr->num_landmarks; i++) { | |
155 cp += itf8_get(cp, &hdr->landmark[i]); | |
156 } | |
157 } | |
158 | |
159 hdr->preservation_map = kh_init(map); | |
160 | |
161 memset(hdr->rec_encoding_map, 0, | |
162 CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0])); | |
163 memset(hdr->tag_encoding_map, 0, | |
164 CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0])); | |
165 | |
166 if (!hdr->preservation_map) { | |
167 cram_free_compression_header(hdr); | |
168 return NULL; | |
169 } | |
170 | |
171 /* Initialise defaults for preservation map */ | |
172 hdr->mapped_qs_included = 0; | |
173 hdr->unmapped_qs_included = 0; | |
174 hdr->unmapped_placed = 0; | |
175 hdr->qs_included = 0; | |
176 hdr->read_names_included = 0; | |
177 hdr->AP_delta = 1; | |
178 memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20); | |
179 | |
180 /* Preservation map */ | |
181 cp += itf8_get(cp, &map_size); cp_copy = cp; | |
182 cp += itf8_get(cp, &map_count); | |
183 for (i = 0; i < map_count; i++) { | |
184 pmap_t hd; | |
185 khint_t k; | |
186 int r; | |
187 | |
188 cp += 2; | |
189 switch(CRAM_KEY(cp[-2],cp[-1])) { | |
190 case CRAM_KEY('M','I'): | |
191 hd.i = *cp++; | |
192 k = kh_put(map, hdr->preservation_map, "MI", &r); | |
193 if (-1 == r) { | |
194 cram_free_compression_header(hdr); | |
195 return NULL; | |
196 } | |
197 | |
198 kh_val(hdr->preservation_map, k) = hd; | |
199 hdr->mapped_qs_included = hd.i; | |
200 break; | |
201 | |
202 case CRAM_KEY('U','I'): | |
203 hd.i = *cp++; | |
204 k = kh_put(map, hdr->preservation_map, "UI", &r); | |
205 if (-1 == r) { | |
206 cram_free_compression_header(hdr); | |
207 return NULL; | |
208 } | |
209 | |
210 kh_val(hdr->preservation_map, k) = hd; | |
211 hdr->unmapped_qs_included = hd.i; | |
212 break; | |
213 | |
214 case CRAM_KEY('P','I'): | |
215 hd.i = *cp++; | |
216 k = kh_put(map, hdr->preservation_map, "PI", &r); | |
217 if (-1 == r) { | |
218 cram_free_compression_header(hdr); | |
219 return NULL; | |
220 } | |
221 | |
222 kh_val(hdr->preservation_map, k) = hd; | |
223 hdr->unmapped_placed = hd.i; | |
224 break; | |
225 | |
226 case CRAM_KEY('R','N'): | |
227 hd.i = *cp++; | |
228 k = kh_put(map, hdr->preservation_map, "RN", &r); | |
229 if (-1 == r) { | |
230 cram_free_compression_header(hdr); | |
231 return NULL; | |
232 } | |
233 | |
234 kh_val(hdr->preservation_map, k) = hd; | |
235 hdr->read_names_included = hd.i; | |
236 break; | |
237 | |
238 case CRAM_KEY('A','P'): | |
239 hd.i = *cp++; | |
240 k = kh_put(map, hdr->preservation_map, "AP", &r); | |
241 if (-1 == r) { | |
242 cram_free_compression_header(hdr); | |
243 return NULL; | |
244 } | |
245 | |
246 kh_val(hdr->preservation_map, k) = hd; | |
247 hdr->AP_delta = hd.i; | |
248 break; | |
249 | |
250 case CRAM_KEY('R','R'): | |
251 hd.i = *cp++; | |
252 k = kh_put(map, hdr->preservation_map, "RR", &r); | |
253 if (-1 == r) { | |
254 cram_free_compression_header(hdr); | |
255 return NULL; | |
256 } | |
257 | |
258 kh_val(hdr->preservation_map, k) = hd; | |
259 fd->no_ref = !hd.i; | |
260 break; | |
261 | |
262 case CRAM_KEY('S','M'): | |
263 hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C'; | |
264 hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G'; | |
265 hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T'; | |
266 hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N'; | |
267 | |
268 hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A'; | |
269 hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G'; | |
270 hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T'; | |
271 hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N'; | |
272 | |
273 hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A'; | |
274 hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C'; | |
275 hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T'; | |
276 hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N'; | |
277 | |
278 hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A'; | |
279 hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C'; | |
280 hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G'; | |
281 hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N'; | |
282 | |
283 hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A'; | |
284 hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C'; | |
285 hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G'; | |
286 hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T'; | |
287 | |
288 hd.p = cp; | |
289 cp += 5; | |
290 | |
291 k = kh_put(map, hdr->preservation_map, "SM", &r); | |
292 if (-1 == r) { | |
293 cram_free_compression_header(hdr); | |
294 return NULL; | |
295 } | |
296 kh_val(hdr->preservation_map, k) = hd; | |
297 break; | |
298 | |
299 case CRAM_KEY('T','D'): { | |
300 int sz = cram_decode_TD(cp, hdr); // tag dictionary | |
301 if (sz < 0) { | |
302 cram_free_compression_header(hdr); | |
303 return NULL; | |
304 } | |
305 | |
306 hd.p = cp; | |
307 cp += sz; | |
308 | |
309 k = kh_put(map, hdr->preservation_map, "TD", &r); | |
310 if (-1 == r) { | |
311 cram_free_compression_header(hdr); | |
312 return NULL; | |
313 } | |
314 kh_val(hdr->preservation_map, k) = hd; | |
315 break; | |
316 } | |
317 | |
318 default: | |
319 fprintf(stderr, "Unrecognised preservation map key %c%c\n", | |
320 cp[-2], cp[-1]); | |
321 // guess byte; | |
322 cp++; | |
323 break; | |
324 } | |
325 } | |
326 if (cp - cp_copy != map_size) { | |
327 cram_free_compression_header(hdr); | |
328 return NULL; | |
329 } | |
330 | |
331 /* Record encoding map */ | |
332 cp += itf8_get(cp, &map_size); cp_copy = cp; | |
333 cp += itf8_get(cp, &map_count); | |
334 for (i = 0; i < map_count; i++) { | |
335 char *key = cp; | |
336 int32_t encoding; | |
337 int32_t size; | |
338 cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc | |
339 | |
340 if (!m) { | |
341 cram_free_compression_header(hdr); | |
342 return NULL; | |
343 } | |
344 | |
345 cp += 2; | |
346 cp += itf8_get(cp, &encoding); | |
347 cp += itf8_get(cp, &size); | |
348 | |
349 // Fill out cram_map purely for cram_dump to dump out. | |
350 m->key = (key[0]<<8)|key[1]; | |
351 m->encoding = encoding; | |
352 m->size = size; | |
353 m->offset = cp - (char *)b->data; | |
354 m->codec = NULL; | |
355 | |
356 if (m->encoding == E_NULL) | |
357 continue; | |
358 | |
359 //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key); | |
360 | |
361 /* | |
362 * For CRAM1.0 CF and BF are Byte and not Int. | |
363 * Practically speaking it makes no difference unless we have a | |
364 * 1.0 format file that stores these in EXTERNAL as only then | |
365 * does Byte vs Int matter. | |
366 * | |
367 * Neither this C code nor Java reference implementations did this, | |
368 * so we gloss over it and treat them as int. | |
369 */ | |
370 | |
371 if (key[0] == 'B' && key[1] == 'F') { | |
372 if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size, | |
373 E_INT, | |
374 fd->version))) { | |
375 cram_free_compression_header(hdr); | |
376 return NULL; | |
377 } | |
378 } else if (key[0] == 'C' && key[1] == 'F') { | |
379 if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size, | |
380 E_INT, | |
381 fd->version))) { | |
382 cram_free_compression_header(hdr); | |
383 return NULL; | |
384 } | |
385 } else if (key[0] == 'R' && key[1] == 'I') { | |
386 if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size, | |
387 E_INT, | |
388 fd->version))) { | |
389 cram_free_compression_header(hdr); | |
390 return NULL; | |
391 } | |
392 } else if (key[0] == 'R' && key[1] == 'L') { | |
393 if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size, | |
394 E_INT, | |
395 fd->version))) { | |
396 cram_free_compression_header(hdr); | |
397 return NULL; | |
398 } | |
399 } else if (key[0] == 'A' && key[1] == 'P') { | |
400 if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size, | |
401 E_INT, | |
402 fd->version))) { | |
403 cram_free_compression_header(hdr); | |
404 return NULL; | |
405 } | |
406 } else if (key[0] == 'R' && key[1] == 'G') { | |
407 if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size, | |
408 E_INT, | |
409 fd->version))) { | |
410 cram_free_compression_header(hdr); | |
411 return NULL; | |
412 } | |
413 } else if (key[0] == 'M' && key[1] == 'F') { | |
414 if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size, | |
415 E_INT, | |
416 fd->version))) { | |
417 cram_free_compression_header(hdr); | |
418 return NULL; | |
419 } | |
420 } else if (key[0] == 'N' && key[1] == 'S') { | |
421 if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size, | |
422 E_INT, | |
423 fd->version))) { | |
424 cram_free_compression_header(hdr); | |
425 return NULL; | |
426 } | |
427 } else if (key[0] == 'N' && key[1] == 'P') { | |
428 if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size, | |
429 E_INT, | |
430 fd->version))) { | |
431 cram_free_compression_header(hdr); | |
432 return NULL; | |
433 } | |
434 } else if (key[0] == 'T' && key[1] == 'S') { | |
435 if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size, | |
436 E_INT, | |
437 fd->version))) { | |
438 cram_free_compression_header(hdr); | |
439 return NULL; | |
440 } | |
441 } else if (key[0] == 'N' && key[1] == 'F') { | |
442 if (!(hdr->codecs[DS_NF] = cram_decoder_init(encoding, cp, size, | |
443 E_INT, | |
444 fd->version))) { | |
445 cram_free_compression_header(hdr); | |
446 return NULL; | |
447 } | |
448 } else if (key[0] == 'T' && key[1] == 'C') { | |
449 if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size, | |
450 E_BYTE, | |
451 fd->version))) { | |
452 cram_free_compression_header(hdr); | |
453 return NULL; | |
454 } | |
455 } else if (key[0] == 'T' && key[1] == 'N') { | |
456 if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size, | |
457 E_INT, | |
458 fd->version))) { | |
459 cram_free_compression_header(hdr); | |
460 return NULL; | |
461 } | |
462 } else if (key[0] == 'F' && key[1] == 'N') { | |
463 if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size, | |
464 E_INT, | |
465 fd->version))) { | |
466 cram_free_compression_header(hdr); | |
467 return NULL; | |
468 } | |
469 } else if (key[0] == 'F' && key[1] == 'C') { | |
470 if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size, | |
471 E_BYTE, | |
472 fd->version))) { | |
473 cram_free_compression_header(hdr); | |
474 return NULL; | |
475 } | |
476 } else if (key[0] == 'F' && key[1] == 'P') { | |
477 if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size, | |
478 E_INT, | |
479 fd->version))) { | |
480 cram_free_compression_header(hdr); | |
481 return NULL; | |
482 } | |
483 } else if (key[0] == 'B' && key[1] == 'S') { | |
484 if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size, | |
485 E_BYTE, | |
486 fd->version))) { | |
487 cram_free_compression_header(hdr); | |
488 return NULL; | |
489 } | |
490 } else if (key[0] == 'I' && key[1] == 'N') { | |
491 if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, size, | |
492 E_BYTE_ARRAY, | |
493 fd->version))) { | |
494 cram_free_compression_header(hdr); | |
495 return NULL; | |
496 } | |
497 } else if (key[0] == 'S' && key[1] == 'C') { | |
498 if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size, | |
499 E_BYTE_ARRAY, | |
500 fd->version))) { | |
501 cram_free_compression_header(hdr); | |
502 return NULL; | |
503 } | |
504 } else if (key[0] == 'D' && key[1] == 'L') { | |
505 if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size, | |
506 E_INT, | |
507 fd->version))) { | |
508 cram_free_compression_header(hdr); | |
509 return NULL; | |
510 } | |
511 } else if (key[0] == 'B' && key[1] == 'A') { | |
512 if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size, | |
513 E_BYTE, | |
514 fd->version))) { | |
515 cram_free_compression_header(hdr); | |
516 return NULL; | |
517 } | |
518 } else if (key[0] == 'B' && key[1] == 'B') { | |
519 if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size, | |
520 E_BYTE_ARRAY, | |
521 fd->version))) { | |
522 cram_free_compression_header(hdr); | |
523 return NULL; | |
524 } | |
525 } else if (key[0] == 'R' && key[1] == 'S') { | |
526 if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size, | |
527 E_INT, | |
528 fd->version))) { | |
529 cram_free_compression_header(hdr); | |
530 return NULL; | |
531 } | |
532 } else if (key[0] == 'P' && key[1] == 'D') { | |
533 if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size, | |
534 E_INT, | |
535 fd->version))) { | |
536 cram_free_compression_header(hdr); | |
537 return NULL; | |
538 } | |
539 } else if (key[0] == 'H' && key[1] == 'C') { | |
540 if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size, | |
541 E_INT, | |
542 fd->version))) { | |
543 cram_free_compression_header(hdr); | |
544 return NULL; | |
545 } | |
546 } else if (key[0] == 'M' && key[1] == 'Q') { | |
547 if (!(hdr->codecs[DS_MQ] = cram_decoder_init(encoding, cp, size, | |
548 E_INT, | |
549 fd->version))) { | |
550 cram_free_compression_header(hdr); | |
551 return NULL; | |
552 } | |
553 } else if (key[0] == 'R' && key[1] == 'N') { | |
554 if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size, | |
555 E_BYTE_ARRAY_BLOCK, | |
556 fd->version))) { | |
557 cram_free_compression_header(hdr); | |
558 return NULL; | |
559 } | |
560 } else if (key[0] == 'Q' && key[1] == 'S') { | |
561 if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size, | |
562 E_BYTE, | |
563 fd->version))) { | |
564 cram_free_compression_header(hdr); | |
565 return NULL; | |
566 } | |
567 } else if (key[0] == 'Q' && key[1] == 'Q') { | |
568 if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size, | |
569 E_BYTE_ARRAY, | |
570 fd->version))) { | |
571 cram_free_compression_header(hdr); | |
572 return NULL; | |
573 } | |
574 } else if (key[0] == 'T' && key[1] == 'L') { | |
575 if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size, | |
576 E_INT, | |
577 fd->version))) { | |
578 cram_free_compression_header(hdr); | |
579 return NULL; | |
580 } | |
581 } else if (key[0] == 'T' && key[1] == 'M') { | |
582 } else if (key[0] == 'T' && key[1] == 'V') { | |
583 } else | |
584 fprintf(stderr, "Unrecognised key: %.2s\n", key); | |
585 | |
586 cp += size; | |
587 | |
588 m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])]; | |
589 hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m; | |
590 } | |
591 if (cp - cp_copy != map_size) { | |
592 cram_free_compression_header(hdr); | |
593 return NULL; | |
594 } | |
595 | |
596 /* Tag encoding map */ | |
597 cp += itf8_get(cp, &map_size); cp_copy = cp; | |
598 cp += itf8_get(cp, &map_count); | |
599 for (i = 0; i < map_count; i++) { | |
600 int32_t encoding; | |
601 int32_t size; | |
602 cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc | |
603 char *key = cp+1; | |
604 | |
605 if (!m) { | |
606 cram_free_compression_header(hdr); | |
607 return NULL; | |
608 } | |
609 | |
610 m->key = (key[0]<<16)|(key[1]<<8)|key[2]; | |
611 | |
612 cp += 4; // Strictly ITF8, but this suffices | |
613 cp += itf8_get(cp, &encoding); | |
614 cp += itf8_get(cp, &size); | |
615 | |
616 m->encoding = encoding; | |
617 m->size = size; | |
618 m->offset = cp - (char *)b->data; | |
619 if (!(m->codec = cram_decoder_init(encoding, cp, size, | |
620 E_BYTE_ARRAY_BLOCK, fd->version))) { | |
621 cram_free_compression_header(hdr); | |
622 free(m); | |
623 return NULL; | |
624 } | |
625 | |
626 cp += size; | |
627 | |
628 m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])]; | |
629 hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m; | |
630 } | |
631 if (cp - cp_copy != map_size) { | |
632 cram_free_compression_header(hdr); | |
633 return NULL; | |
634 } | |
635 | |
636 return hdr; | |
637 } | |
638 | |
639 /* | |
640 * Note we also need to scan through the record encoding map to | |
641 * see which data series share the same block, either external or | |
642 * CORE. For example if we need the BF data series but MQ and CF | |
643 * are also encoded in the same block then we need to add those in | |
644 * as a dependency in order to correctly decode BF. | |
645 * | |
646 * Returns 0 on success | |
647 * -1 on failure | |
648 */ | |
649 int cram_dependent_data_series(cram_fd *fd, | |
650 cram_block_compression_hdr *hdr, | |
651 cram_slice *s) { | |
652 int *block_used; | |
653 int core_used = 0; | |
654 int i; | |
655 static int i_to_id[] = { | |
656 DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS, | |
657 DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN, | |
658 DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD, | |
659 DS_HC, DS_SC, DS_BB, DS_QQ, | |
660 }; | |
661 uint32_t orig_ds; | |
662 | |
663 /* | |
664 * Set the data_series bit field based on fd->required_fields | |
665 * contents. | |
666 */ | |
667 if (fd->required_fields && fd->required_fields != INT_MAX) { | |
668 hdr->data_series = 0; | |
669 | |
670 if (fd->required_fields & SAM_QNAME) | |
671 hdr->data_series |= CRAM_RN; | |
672 | |
673 if (fd->required_fields & SAM_FLAG) | |
674 hdr->data_series |= CRAM_BF; | |
675 | |
676 if (fd->required_fields & SAM_RNAME) | |
677 hdr->data_series |= CRAM_RI | CRAM_BF; | |
678 | |
679 if (fd->required_fields & SAM_POS) | |
680 hdr->data_series |= CRAM_AP | CRAM_BF; | |
681 | |
682 if (fd->required_fields & SAM_MAPQ) | |
683 hdr->data_series |= CRAM_MQ; | |
684 | |
685 if (fd->required_fields & SAM_CIGAR) | |
686 hdr->data_series |= CRAM_CIGAR; | |
687 | |
688 if (fd->required_fields & SAM_RNEXT) | |
689 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF; | |
690 | |
691 if (fd->required_fields & SAM_PNEXT) | |
692 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF; | |
693 | |
694 if (fd->required_fields & SAM_TLEN) | |
695 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS | | |
696 CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR; | |
697 | |
698 if (fd->required_fields & SAM_SEQ) | |
699 hdr->data_series |= CRAM_SEQ; | |
700 | |
701 if (!(fd->required_fields & SAM_AUX)) | |
702 // No easy way to get MD/NM without other tags at present | |
703 fd->decode_md = 0; | |
704 | |
705 if (fd->required_fields & SAM_QUAL) | |
706 hdr->data_series |= CRAM_SEQ; | |
707 | |
708 if (fd->required_fields & SAM_AUX) | |
709 hdr->data_series |= CRAM_RG | CRAM_TL | CRAM_aux; | |
710 | |
711 if (fd->required_fields & SAM_RGAUX) | |
712 hdr->data_series |= CRAM_RG | CRAM_BF; | |
713 | |
714 // Always uncompress CORE block | |
715 if (cram_uncompress_block(s->block[0])) | |
716 return -1; | |
717 } else { | |
718 hdr->data_series = CRAM_ALL; | |
719 | |
720 for (i = 0; i < s->hdr->num_blocks; i++) { | |
721 if (cram_uncompress_block(s->block[i])) | |
722 return -1; | |
723 } | |
724 | |
725 return 0; | |
726 } | |
727 | |
728 block_used = calloc(s->hdr->num_blocks+1, sizeof(int)); | |
729 if (!block_used) | |
730 return -1; | |
731 | |
732 do { | |
733 /* | |
734 * Also set data_series based on code prerequisites. Eg if we need | |
735 * CRAM_QS then we also need to know CRAM_RL so we know how long it | |
736 * is, or if we need FC/FP then we also need FN (number of features). | |
737 * | |
738 * It's not reciprocal though. We may be needing to decode FN | |
739 * but have no need to decode FC, FP and cigar ops. | |
740 */ | |
741 if (hdr->data_series & CRAM_RS) hdr->data_series |= CRAM_FC|CRAM_FP; | |
742 if (hdr->data_series & CRAM_PD) hdr->data_series |= CRAM_FC|CRAM_FP; | |
743 if (hdr->data_series & CRAM_HC) hdr->data_series |= CRAM_FC|CRAM_FP; | |
744 if (hdr->data_series & CRAM_QS) hdr->data_series |= CRAM_FC|CRAM_FP; | |
745 if (hdr->data_series & CRAM_IN) hdr->data_series |= CRAM_FC|CRAM_FP; | |
746 if (hdr->data_series & CRAM_SC) hdr->data_series |= CRAM_FC|CRAM_FP; | |
747 if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_FC|CRAM_FP; | |
748 if (hdr->data_series & CRAM_DL) hdr->data_series |= CRAM_FC|CRAM_FP; | |
749 if (hdr->data_series & CRAM_BA) hdr->data_series |= CRAM_FC|CRAM_FP; | |
750 if (hdr->data_series & CRAM_BB) hdr->data_series |= CRAM_FC|CRAM_FP; | |
751 if (hdr->data_series & CRAM_QQ) hdr->data_series |= CRAM_FC|CRAM_FP; | |
752 | |
753 // cram_decode_seq() needs seq[] array | |
754 if (hdr->data_series & (CRAM_SEQ|CRAM_CIGAR)) hdr->data_series |= CRAM_RL; | |
755 | |
756 if (hdr->data_series & CRAM_FP) hdr->data_series |= CRAM_FC; | |
757 if (hdr->data_series & CRAM_FC) hdr->data_series |= CRAM_FN; | |
758 if (hdr->data_series & CRAM_aux) hdr->data_series |= CRAM_TL; | |
759 if (hdr->data_series & CRAM_MF) hdr->data_series |= CRAM_CF; | |
760 if (hdr->data_series & CRAM_MQ) hdr->data_series |= CRAM_BF; | |
761 if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_RI; | |
762 if (hdr->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF)) | |
763 hdr->data_series |= CRAM_CF; | |
764 if (!hdr->read_names_included && hdr->data_series & CRAM_RN) | |
765 hdr->data_series |= CRAM_CF | CRAM_NF; | |
766 if (hdr->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ)) | |
767 hdr->data_series |= CRAM_BF | CRAM_CF | CRAM_RL; | |
768 | |
769 orig_ds = hdr->data_series; | |
770 | |
771 // Find which blocks are in use. | |
772 for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) { | |
773 int bnum1, bnum2, j; | |
774 cram_codec *c = hdr->codecs[i_to_id[i]]; | |
775 | |
776 if (!(hdr->data_series & (1<<i))) | |
777 continue; | |
778 | |
779 if (!c) | |
780 continue; | |
781 | |
782 bnum1 = cram_codec_to_id(c, &bnum2); | |
783 | |
784 for (;;) { | |
785 switch (bnum1) { | |
786 case -2: | |
787 break; | |
788 | |
789 case -1: | |
790 core_used = 1; | |
791 break; | |
792 | |
793 default: | |
794 for (j = 0; j < s->hdr->num_blocks; j++) { | |
795 if (s->block[j]->content_type == EXTERNAL && | |
796 s->block[j]->content_id == bnum1) { | |
797 block_used[j] = 1; | |
798 if (cram_uncompress_block(s->block[j])) { | |
799 free(block_used); | |
800 return -1; | |
801 } | |
802 } | |
803 } | |
804 break; | |
805 } | |
806 | |
807 if (bnum2 == -2 || bnum1 == bnum2) | |
808 break; | |
809 | |
810 bnum1 = bnum2; // 2nd pass | |
811 } | |
812 } | |
813 | |
814 // Tags too | |
815 if ((fd->required_fields & SAM_AUX) || | |
816 (hdr->data_series & CRAM_aux)) { | |
817 for (i = 0; i < CRAM_MAP_HASH; i++) { | |
818 int bnum1, bnum2, j; | |
819 cram_map *m = hdr->tag_encoding_map[i]; | |
820 | |
821 while (m) { | |
822 cram_codec *c = m->codec; | |
823 if (!c) | |
824 continue; | |
825 | |
826 bnum1 = cram_codec_to_id(c, &bnum2); | |
827 | |
828 for (;;) { | |
829 switch (bnum1) { | |
830 case -2: | |
831 break; | |
832 | |
833 case -1: | |
834 core_used = 1; | |
835 break; | |
836 | |
837 default: | |
838 for (j = 0; j < s->hdr->num_blocks; j++) { | |
839 if (s->block[j]->content_type == EXTERNAL && | |
840 s->block[j]->content_id == bnum1) { | |
841 block_used[j] = 1; | |
842 if (cram_uncompress_block(s->block[j])) { | |
843 free(block_used); | |
844 return -1; | |
845 } | |
846 } | |
847 } | |
848 break; | |
849 } | |
850 | |
851 if (bnum2 == -2 || bnum1 == bnum2) | |
852 break; | |
853 | |
854 bnum1 = bnum2; // 2nd pass | |
855 } | |
856 | |
857 m = m->next; | |
858 } | |
859 } | |
860 } | |
861 | |
862 // We now know which blocks are in used, so repeat and find | |
863 // which other data series need to be added. | |
864 for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) { | |
865 int bnum1, bnum2, j; | |
866 cram_codec *c = hdr->codecs[i_to_id[i]]; | |
867 | |
868 if (!c) | |
869 continue; | |
870 | |
871 bnum1 = cram_codec_to_id(c, &bnum2); | |
872 | |
873 for (;;) { | |
874 switch (bnum1) { | |
875 case -2: | |
876 break; | |
877 | |
878 case -1: | |
879 if (core_used) { | |
880 //printf(" + data series %08x:\n", 1<<i); | |
881 hdr->data_series |= 1<<i; | |
882 } | |
883 break; | |
884 | |
885 default: | |
886 for (j = 0; j < s->hdr->num_blocks; j++) { | |
887 if (s->block[j]->content_type == EXTERNAL && | |
888 s->block[j]->content_id == bnum1) { | |
889 if (block_used[j]) { | |
890 //printf(" + data series %08x:\n", 1<<i); | |
891 hdr->data_series |= 1<<i; | |
892 } | |
893 } | |
894 } | |
895 break; | |
896 } | |
897 | |
898 if (bnum2 == -2 || bnum1 == bnum2) | |
899 break; | |
900 | |
901 bnum1 = bnum2; // 2nd pass | |
902 } | |
903 } | |
904 | |
905 // Tags too | |
906 for (i = 0; i < CRAM_MAP_HASH; i++) { | |
907 int bnum1, bnum2, j; | |
908 cram_map *m = hdr->tag_encoding_map[i]; | |
909 | |
910 while (m) { | |
911 cram_codec *c = m->codec; | |
912 if (!c) | |
913 continue; | |
914 | |
915 bnum1 = cram_codec_to_id(c, &bnum2); | |
916 | |
917 for (;;) { | |
918 switch (bnum1) { | |
919 case -2: | |
920 break; | |
921 | |
922 case -1: | |
923 //printf(" + data series %08x:\n", CRAM_aux); | |
924 hdr->data_series |= CRAM_aux; | |
925 break; | |
926 | |
927 default: | |
928 for (j = 0; j < s->hdr->num_blocks; j++) { | |
929 if (s->block[j]->content_type && | |
930 s->block[j]->content_id == bnum1) { | |
931 if (block_used[j]) { | |
932 //printf(" + data series %08x:\n", | |
933 // CRAM_aux); | |
934 hdr->data_series |= CRAM_aux; | |
935 } | |
936 } | |
937 } | |
938 break; | |
939 } | |
940 | |
941 if (bnum2 == -2 || bnum1 == bnum2) | |
942 break; | |
943 | |
944 bnum1 = bnum2; // 2nd pass | |
945 } | |
946 | |
947 m = m->next; | |
948 } | |
949 } | |
950 } while (orig_ds != hdr->data_series); | |
951 | |
952 free(block_used); | |
953 return 0; | |
954 } | |
955 | |
956 /* ---------------------------------------------------------------------- | |
957 * CRAM slices | |
958 */ | |
959 | |
960 /* | |
961 * Decodes a CRAM (un)mapped slice header block. | |
962 * Returns slice header ptr on success | |
963 * NULL on failure | |
964 */ | |
965 cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { | |
966 cram_block_slice_hdr *hdr; | |
967 char *cp = (char *)b->data; | |
968 int i; | |
969 | |
970 if (b->content_type != MAPPED_SLICE && | |
971 b->content_type != UNMAPPED_SLICE) | |
972 return NULL; | |
973 | |
974 if (!(hdr = calloc(1, sizeof(*hdr)))) | |
975 return NULL; | |
976 | |
977 hdr->content_type = b->content_type; | |
978 | |
979 if (b->content_type == MAPPED_SLICE) { | |
980 cp += itf8_get(cp, &hdr->ref_seq_id); | |
981 cp += itf8_get(cp, &hdr->ref_seq_start); | |
982 cp += itf8_get(cp, &hdr->ref_seq_span); | |
983 } | |
984 cp += itf8_get(cp, &hdr->num_records); | |
985 hdr->record_counter = 0; | |
986 if (CRAM_MAJOR_VERS(fd->version) == 2) { | |
987 int32_t i32; | |
988 cp += itf8_get(cp, &i32); | |
989 hdr->record_counter = i32; | |
990 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { | |
991 cp += ltf8_get(cp, &hdr->record_counter); | |
992 } | |
993 | |
994 cp += itf8_get(cp, &hdr->num_blocks); | |
995 | |
996 cp += itf8_get(cp, &hdr->num_content_ids); | |
997 hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t)); | |
998 if (!hdr->block_content_ids) { | |
999 free(hdr); | |
1000 return NULL; | |
1001 } | |
1002 | |
1003 for (i = 0; i < hdr->num_content_ids; i++) { | |
1004 cp += itf8_get(cp, &hdr->block_content_ids[i]); | |
1005 } | |
1006 | |
1007 if (b->content_type == MAPPED_SLICE) { | |
1008 cp += itf8_get(cp, &hdr->ref_base_id); | |
1009 } | |
1010 | |
1011 if (CRAM_MAJOR_VERS(fd->version) != 1) { | |
1012 memcpy(hdr->md5, cp, 16); | |
1013 } else { | |
1014 memset(hdr->md5, 0, 16); | |
1015 } | |
1016 | |
1017 return hdr; | |
1018 } | |
1019 | |
1020 | |
1021 #if 0 | |
1022 /* Returns the number of bits set in val; it the highest bit used */ | |
1023 static int nbits(int v) { | |
1024 static const int MultiplyDeBruijnBitPosition[32] = { | |
1025 1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31, | |
1026 9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32 | |
1027 }; | |
1028 | |
1029 v |= v >> 1; // first up to set all bits 1 after the first 1 */ | |
1030 v |= v >> 2; | |
1031 v |= v >> 4; | |
1032 v |= v >> 8; | |
1033 v |= v >> 16; | |
1034 | |
1035 // DeBruijn magic to find top bit | |
1036 return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27]; | |
1037 } | |
1038 #endif | |
1039 | |
1040 #if 0 | |
1041 static int sort_freqs(const void *vp1, const void *vp2) { | |
1042 const int i1 = *(const int *)vp1; | |
1043 const int i2 = *(const int *)vp2; | |
1044 return i1-i2; | |
1045 } | |
1046 #endif | |
1047 | |
1048 /* ---------------------------------------------------------------------- | |
1049 * Primary CRAM sequence decoder | |
1050 */ | |
1051 | |
1052 /* | |
1053 * Internal part of cram_decode_slice(). | |
1054 * Generates the sequence, quality and cigar components. | |
1055 */ | |
1056 static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, | |
1057 cram_block *blk, cram_record *cr, SAM_hdr *bfd, | |
1058 int cf, char *seq, char *qual) { | |
1059 int prev_pos = 0, f, r = 0, out_sz = 1; | |
1060 int seq_pos = 1; | |
1061 int cig_len = 0, ref_pos = cr->apos; | |
1062 int32_t fn, i32; | |
1063 enum cigar_op cig_op = BAM_CMATCH; | |
1064 uint32_t *cigar = s->cigar; | |
1065 uint32_t ncigar = s->ncigar; | |
1066 uint32_t cigar_alloc = s->cigar_alloc; | |
1067 uint32_t nm = 0, md_dist = 0; | |
1068 int orig_aux = 0; | |
1069 int decode_md = fd->decode_md && s->ref; | |
1070 uint32_t ds = c->comp_hdr->data_series; | |
1071 | |
1072 if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { | |
1073 memset(qual, 30, cr->len); | |
1074 } | |
1075 | |
1076 if (decode_md) { | |
1077 orig_aux = BLOCK_SIZE(s->aux_blk); | |
1078 BLOCK_APPEND(s->aux_blk, "MDZ", 3); | |
1079 } | |
1080 | |
1081 if (ds & CRAM_FN) { | |
1082 if (!c->comp_hdr->codecs[DS_FN]) return -1; | |
1083 r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN], | |
1084 blk, (char *)&fn, &out_sz); | |
1085 } else { | |
1086 fn = 0; | |
1087 } | |
1088 | |
1089 ref_pos--; // count from 0 | |
1090 cr->cigar = ncigar; | |
1091 | |
1092 if (!(ds & (CRAM_FC | CRAM_FP))) | |
1093 goto skip_cigar; | |
1094 | |
1095 for (f = 0; f < fn; f++) { | |
1096 int32_t pos = 0; | |
1097 char op; | |
1098 | |
1099 if (ncigar+2 >= cigar_alloc) { | |
1100 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; | |
1101 s->cigar = cigar; | |
1102 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) | |
1103 return -1; | |
1104 } | |
1105 | |
1106 if (ds & CRAM_FC) { | |
1107 if (!c->comp_hdr->codecs[DS_FC]) return -1; | |
1108 r |= c->comp_hdr->codecs[DS_FC]->decode(s, | |
1109 c->comp_hdr->codecs[DS_FC], | |
1110 blk, | |
1111 &op, &out_sz); | |
1112 } | |
1113 | |
1114 if (!(ds & CRAM_FP)) | |
1115 continue; | |
1116 | |
1117 if (!c->comp_hdr->codecs[DS_FP]) return -1; | |
1118 r |= c->comp_hdr->codecs[DS_FP]->decode(s, | |
1119 c->comp_hdr->codecs[DS_FP], | |
1120 blk, | |
1121 (char *)&pos, &out_sz); | |
1122 pos += prev_pos; | |
1123 | |
1124 if (pos > seq_pos) { | |
1125 if (pos > cr->len+1) | |
1126 return -1; | |
1127 | |
1128 if (s->ref && cr->ref_id >= 0) { | |
1129 if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) { | |
1130 static int whinged = 0; | |
1131 if (!whinged) | |
1132 fprintf(stderr, "Ref pos outside of ref " | |
1133 "sequence boundary\n"); | |
1134 whinged = 1; | |
1135 } else { | |
1136 memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], | |
1137 pos - seq_pos); | |
1138 } | |
1139 } | |
1140 #ifdef USE_X | |
1141 if (cig_len && cig_op != BAM_CBASE_MATCH) { | |
1142 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1143 cig_len = 0; | |
1144 } | |
1145 cig_op = BAM_CBASE_MATCH; | |
1146 #else | |
1147 if (cig_len && cig_op != BAM_CMATCH) { | |
1148 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1149 cig_len = 0; | |
1150 } | |
1151 cig_op = BAM_CMATCH; | |
1152 #endif | |
1153 cig_len += pos - seq_pos; | |
1154 ref_pos += pos - seq_pos; | |
1155 md_dist += pos - seq_pos; | |
1156 seq_pos = pos; | |
1157 } | |
1158 | |
1159 prev_pos = pos; | |
1160 | |
1161 if (!(ds & CRAM_FC)) | |
1162 goto skip_cigar; | |
1163 | |
1164 if (!(ds & CRAM_FC)) | |
1165 continue; | |
1166 | |
1167 switch(op) { | |
1168 case 'S': { // soft clip: IN | |
1169 int32_t out_sz2 = 1; | |
1170 | |
1171 if (cig_len) { | |
1172 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1173 cig_len = 0; | |
1174 } | |
1175 if (ds & CRAM_IN) { | |
1176 switch (CRAM_MAJOR_VERS(fd->version)) { | |
1177 case 1: | |
1178 r |= c->comp_hdr->codecs[DS_IN] | |
1179 ? c->comp_hdr->codecs[DS_IN] | |
1180 ->decode(s, c->comp_hdr->codecs[DS_IN], | |
1181 blk, &seq[pos-1], &out_sz2) | |
1182 : (seq[pos-1] = 'N', out_sz2 = 1, 0); | |
1183 break; | |
1184 | |
1185 case 2: | |
1186 default: | |
1187 r |= c->comp_hdr->codecs[DS_SC] | |
1188 ? c->comp_hdr->codecs[DS_SC] | |
1189 ->decode(s, c->comp_hdr->codecs[DS_SC], | |
1190 blk, &seq[pos-1], &out_sz2) | |
1191 : (seq[pos-1] = 'N', out_sz2 = 1, 0); | |
1192 break; | |
1193 | |
1194 // default: | |
1195 // r |= c->comp_hdr->codecs[DS_BB] | |
1196 // ? c->comp_hdr->codecs[DS_BB] | |
1197 // ->decode(s, c->comp_hdr->codecs[DS_BB], | |
1198 // blk, &seq[pos-1], &out_sz2) | |
1199 // : (seq[pos-1] = 'N', out_sz2 = 1, 0); | |
1200 } | |
1201 cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP; | |
1202 cig_op = BAM_CSOFT_CLIP; | |
1203 seq_pos += out_sz2; | |
1204 } | |
1205 break; | |
1206 } | |
1207 | |
1208 case 'X': { // Substitution; BS | |
1209 unsigned char base; | |
1210 #ifdef USE_X | |
1211 if (cig_len && cig_op != BAM_CBASE_MISMATCH) { | |
1212 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1213 cig_len = 0; | |
1214 } | |
1215 if (ds & CRAM_BS) { | |
1216 if (!c->comp_hdr->codecs[DS_BS]) return -1; | |
1217 r |= c->comp_hdr->codecs[DS_BS] | |
1218 ->decode(s, c->comp_hdr->codecs[DS_BS], blk, | |
1219 (char *)&base, &out_sz); | |
1220 seq[pos-1] = 'N'; // FIXME look up BS=base value | |
1221 } | |
1222 cig_op = BAM_CBASE_MISMATCH; | |
1223 #else | |
1224 int ref_base; | |
1225 if (cig_len && cig_op != BAM_CMATCH) { | |
1226 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1227 cig_len = 0; | |
1228 } | |
1229 if (ds & CRAM_BS) { | |
1230 if (!c->comp_hdr->codecs[DS_BS]) return -1; | |
1231 r |= c->comp_hdr->codecs[DS_BS] | |
1232 ->decode(s, c->comp_hdr->codecs[DS_BS], blk, | |
1233 (char *)&base, &out_sz); | |
1234 if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) { | |
1235 seq[pos-1] = 'N'; | |
1236 } else { | |
1237 ref_base = fd->L1[(uc)s->ref[ref_pos - s->ref_start +1]]; | |
1238 seq[pos-1] = c->comp_hdr-> | |
1239 substitution_matrix[ref_base][base]; | |
1240 if (decode_md) { | |
1241 BLOCK_APPEND_UINT(s->aux_blk, md_dist); | |
1242 BLOCK_APPEND_CHAR(s->aux_blk, | |
1243 s->ref[ref_pos-s->ref_start +1]); | |
1244 md_dist = 0; | |
1245 } | |
1246 } | |
1247 } | |
1248 cig_op = BAM_CMATCH; | |
1249 #endif | |
1250 nm++; | |
1251 cig_len++; | |
1252 seq_pos++; | |
1253 ref_pos++; | |
1254 break; | |
1255 } | |
1256 | |
1257 case 'D': { // Deletion; DL | |
1258 if (cig_len && cig_op != BAM_CDEL) { | |
1259 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1260 cig_len = 0; | |
1261 } | |
1262 if (ds & CRAM_DL) { | |
1263 if (!c->comp_hdr->codecs[DS_DL]) return -1; | |
1264 r |= c->comp_hdr->codecs[DS_DL] | |
1265 ->decode(s, c->comp_hdr->codecs[DS_DL], blk, | |
1266 (char *)&i32, &out_sz); | |
1267 if (decode_md) { | |
1268 BLOCK_APPEND_UINT(s->aux_blk, md_dist); | |
1269 BLOCK_APPEND_CHAR(s->aux_blk, '^'); | |
1270 BLOCK_APPEND(s->aux_blk, | |
1271 &s->ref[ref_pos - s->ref_start +1], | |
1272 i32); | |
1273 md_dist = 0; | |
1274 } | |
1275 cig_op = BAM_CDEL; | |
1276 cig_len += i32; | |
1277 ref_pos += i32; | |
1278 nm += i32; | |
1279 //printf(" %d: DL = %d (ret %d)\n", f, i32, r); | |
1280 } | |
1281 break; | |
1282 } | |
1283 | |
1284 case 'I': { // Insertion (several bases); IN | |
1285 int32_t out_sz2 = 1; | |
1286 | |
1287 if (cig_len && cig_op != BAM_CINS) { | |
1288 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1289 cig_len = 0; | |
1290 } | |
1291 | |
1292 if (ds & CRAM_IN) { | |
1293 if (!c->comp_hdr->codecs[DS_IN]) return -1; | |
1294 r |= c->comp_hdr->codecs[DS_IN] | |
1295 ->decode(s, c->comp_hdr->codecs[DS_IN], blk, | |
1296 &seq[pos-1], &out_sz2); | |
1297 cig_op = BAM_CINS; | |
1298 cig_len += out_sz2; | |
1299 seq_pos += out_sz2; | |
1300 nm += out_sz2; | |
1301 //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2); | |
1302 } | |
1303 break; | |
1304 } | |
1305 | |
1306 case 'i': { // Insertion (single base); BA | |
1307 if (cig_len && cig_op != BAM_CINS) { | |
1308 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1309 cig_len = 0; | |
1310 } | |
1311 if (ds & CRAM_BA) { | |
1312 if (!c->comp_hdr->codecs[DS_BA]) return -1; | |
1313 r |= c->comp_hdr->codecs[DS_BA] | |
1314 ->decode(s, c->comp_hdr->codecs[DS_BA], blk, | |
1315 (char *)&seq[pos-1], &out_sz); | |
1316 } | |
1317 cig_op = BAM_CINS; | |
1318 cig_len++; | |
1319 seq_pos++; | |
1320 nm++; | |
1321 break; | |
1322 } | |
1323 | |
1324 case 'b': { // Several bases | |
1325 int32_t len = 1; | |
1326 | |
1327 if (cig_len && cig_op != BAM_CMATCH) { | |
1328 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1329 cig_len = 0; | |
1330 } | |
1331 | |
1332 if (ds & CRAM_BB) { | |
1333 if (!c->comp_hdr->codecs[DS_BB]) return -1; | |
1334 r |= c->comp_hdr->codecs[DS_BB] | |
1335 ->decode(s, c->comp_hdr->codecs[DS_BB], blk, | |
1336 (char *)&seq[pos-1], &len); | |
1337 } | |
1338 | |
1339 cig_op = BAM_CMATCH; | |
1340 | |
1341 cig_len+=len; | |
1342 seq_pos+=len; | |
1343 ref_pos+=len; | |
1344 //prev_pos+=len; | |
1345 break; | |
1346 } | |
1347 | |
1348 case 'q': { // Several quality values | |
1349 int32_t len = 1; | |
1350 | |
1351 if (cig_len && cig_op != BAM_CMATCH) { | |
1352 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1353 cig_len = 0; | |
1354 } | |
1355 | |
1356 if (ds & CRAM_QQ) { | |
1357 if (!c->comp_hdr->codecs[DS_QQ]) return -1; | |
1358 r |= c->comp_hdr->codecs[DS_QQ] | |
1359 ->decode(s, c->comp_hdr->codecs[DS_QQ], blk, | |
1360 (char *)&qual[pos-1], &len); | |
1361 } | |
1362 | |
1363 cig_op = BAM_CMATCH; | |
1364 | |
1365 cig_len+=len; | |
1366 seq_pos+=len; | |
1367 ref_pos+=len; | |
1368 //prev_pos+=len; | |
1369 break; | |
1370 } | |
1371 | |
1372 case 'B': { // Read base; BA, QS | |
1373 #ifdef USE_X | |
1374 if (cig_len && cig_op != BAM_CBASE_MISMATCH) { | |
1375 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1376 cig_len = 0; | |
1377 } | |
1378 #else | |
1379 if (cig_len && cig_op != BAM_CMATCH) { | |
1380 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1381 cig_len = 0; | |
1382 } | |
1383 #endif | |
1384 if (ds & CRAM_BA) { | |
1385 if (!c->comp_hdr->codecs[DS_BA]) return -1; | |
1386 r |= c->comp_hdr->codecs[DS_BA] | |
1387 ->decode(s, c->comp_hdr->codecs[DS_BA], blk, | |
1388 (char *)&seq[pos-1], &out_sz); | |
1389 } | |
1390 if (ds & CRAM_QS) { | |
1391 if (!c->comp_hdr->codecs[DS_QS]) return -1; | |
1392 r |= c->comp_hdr->codecs[DS_QS] | |
1393 ->decode(s, c->comp_hdr->codecs[DS_QS], blk, | |
1394 (char *)&qual[pos-1], &out_sz); | |
1395 } | |
1396 #ifdef USE_X | |
1397 cig_op = BAM_CBASE_MISMATCH; | |
1398 #else | |
1399 cig_op = BAM_CMATCH; | |
1400 #endif | |
1401 cig_len++; | |
1402 seq_pos++; | |
1403 ref_pos++; | |
1404 //printf(" %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r); | |
1405 break; | |
1406 } | |
1407 | |
1408 case 'Q': { // Quality score; QS | |
1409 if (ds & CRAM_QS) { | |
1410 if (!c->comp_hdr->codecs[DS_QS]) return -1; | |
1411 r |= c->comp_hdr->codecs[DS_QS] | |
1412 ->decode(s, c->comp_hdr->codecs[DS_QS], blk, | |
1413 (char *)&qual[pos-1], &out_sz); | |
1414 //printf(" %d: QS = %d (ret %d)\n", f, qc, r); | |
1415 } | |
1416 break; | |
1417 } | |
1418 | |
1419 case 'H': { // hard clip; HC | |
1420 if (cig_len && cig_op != BAM_CHARD_CLIP) { | |
1421 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1422 cig_len = 0; | |
1423 } | |
1424 if (ds & CRAM_HC) { | |
1425 if (!c->comp_hdr->codecs[DS_HC]) return -1; | |
1426 r |= c->comp_hdr->codecs[DS_HC] | |
1427 ->decode(s, c->comp_hdr->codecs[DS_HC], blk, | |
1428 (char *)&i32, &out_sz); | |
1429 cig_op = BAM_CHARD_CLIP; | |
1430 cig_len += i32; | |
1431 nm += i32; | |
1432 } | |
1433 break; | |
1434 } | |
1435 | |
1436 case 'P': { // padding; PD | |
1437 if (cig_len && cig_op != BAM_CPAD) { | |
1438 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1439 cig_len = 0; | |
1440 } | |
1441 if (ds & CRAM_PD) { | |
1442 if (!c->comp_hdr->codecs[DS_PD]) return -1; | |
1443 r |= c->comp_hdr->codecs[DS_PD] | |
1444 ->decode(s, c->comp_hdr->codecs[DS_PD], blk, | |
1445 (char *)&i32, &out_sz); | |
1446 cig_op = BAM_CPAD; | |
1447 cig_len += i32; | |
1448 nm += i32; | |
1449 } | |
1450 break; | |
1451 } | |
1452 | |
1453 case 'N': { // Ref skip; RS | |
1454 if (cig_len && cig_op != BAM_CREF_SKIP) { | |
1455 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1456 cig_len = 0; | |
1457 } | |
1458 if (ds & CRAM_RS) { | |
1459 if (!c->comp_hdr->codecs[DS_RS]) return -1; | |
1460 r |= c->comp_hdr->codecs[DS_RS] | |
1461 ->decode(s, c->comp_hdr->codecs[DS_RS], blk, | |
1462 (char *)&i32, &out_sz); | |
1463 cig_op = BAM_CREF_SKIP; | |
1464 cig_len += i32; | |
1465 ref_pos += i32; | |
1466 nm += i32; | |
1467 } | |
1468 break; | |
1469 } | |
1470 | |
1471 default: | |
1472 abort(); | |
1473 } | |
1474 } | |
1475 | |
1476 if (!(ds & CRAM_FC)) | |
1477 goto skip_cigar; | |
1478 | |
1479 /* An implement match op for any unaccounted for bases */ | |
1480 if ((ds & CRAM_FN) && cr->len >= seq_pos) { | |
1481 if (s->ref) { | |
1482 if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) { | |
1483 static int whinged = 0; | |
1484 if (!whinged) | |
1485 fprintf(stderr, "Ref pos outside of ref sequence boundary\n"); | |
1486 whinged = 1; | |
1487 } else { | |
1488 memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], | |
1489 cr->len - seq_pos + 1); | |
1490 ref_pos += cr->len - seq_pos + 1; | |
1491 md_dist += cr->len - seq_pos + 1; | |
1492 } | |
1493 } | |
1494 | |
1495 if (ncigar+1 >= cigar_alloc) { | |
1496 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; | |
1497 s->cigar = cigar; | |
1498 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) | |
1499 return -1; | |
1500 } | |
1501 #ifdef USE_X | |
1502 if (cig_len && cig_op != BAM_CBASE_MATCH) { | |
1503 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1504 cig_len = 0; | |
1505 } | |
1506 cig_op = BAM_CBASE_MATCH; | |
1507 #else | |
1508 if (cig_len && cig_op != BAM_CMATCH) { | |
1509 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1510 cig_len = 0; | |
1511 } | |
1512 cig_op = BAM_CMATCH; | |
1513 #endif | |
1514 cig_len += cr->len - seq_pos+1; | |
1515 } | |
1516 | |
1517 skip_cigar: | |
1518 | |
1519 if ((ds & CRAM_FN) && decode_md) { | |
1520 BLOCK_APPEND_UINT(s->aux_blk, md_dist); | |
1521 } | |
1522 | |
1523 if (cig_len) { | |
1524 if (ncigar >= cigar_alloc) { | |
1525 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; | |
1526 s->cigar = cigar; | |
1527 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) | |
1528 return -1; | |
1529 } | |
1530 | |
1531 cigar[ncigar++] = (cig_len<<4) + cig_op; | |
1532 } | |
1533 | |
1534 cr->ncigar = ncigar - cr->cigar; | |
1535 cr->aend = ref_pos; | |
1536 | |
1537 //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos); | |
1538 | |
1539 if (ds & CRAM_MQ) { | |
1540 if (!c->comp_hdr->codecs[DS_MQ]) return -1; | |
1541 r |= c->comp_hdr->codecs[DS_MQ] | |
1542 ->decode(s, c->comp_hdr->codecs[DS_MQ], blk, | |
1543 (char *)&cr->mqual, &out_sz); | |
1544 } else { | |
1545 cr->mqual = 40; | |
1546 } | |
1547 | |
1548 if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { | |
1549 int32_t out_sz2 = cr->len; | |
1550 | |
1551 if (ds & CRAM_QS) { | |
1552 if (!c->comp_hdr->codecs[DS_QS]) return -1; | |
1553 r |= c->comp_hdr->codecs[DS_QS] | |
1554 ->decode(s, c->comp_hdr->codecs[DS_QS], blk, | |
1555 qual, &out_sz2); | |
1556 } | |
1557 } | |
1558 | |
1559 s->cigar = cigar; | |
1560 s->cigar_alloc = cigar_alloc; | |
1561 s->ncigar = ncigar; | |
1562 | |
1563 if (decode_md) { | |
1564 char buf[7]; | |
1565 BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z: | |
1566 cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux; | |
1567 buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I'; | |
1568 buf[3] = (nm>> 0) & 0xff; | |
1569 buf[4] = (nm>> 8) & 0xff; | |
1570 buf[5] = (nm>>16) & 0xff; | |
1571 buf[6] = (nm>>24) & 0xff; | |
1572 BLOCK_APPEND(s->aux_blk, buf, 7); | |
1573 cr->aux_size += 7; | |
1574 } | |
1575 | |
1576 return r; | |
1577 } | |
1578 | |
1579 /* | |
1580 * Quick and simple hash lookup for cram_map arrays | |
1581 */ | |
1582 static cram_map *map_find(cram_map **map, unsigned char *key, int id) { | |
1583 cram_map *m; | |
1584 | |
1585 m = map[CRAM_MAP(key[0],key[1])]; | |
1586 while (m && m->key != id) | |
1587 m= m->next; | |
1588 | |
1589 return m; | |
1590 } | |
1591 | |
1592 //#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next | |
1593 | |
1594 | |
1595 static int cram_decode_aux_1_0(cram_container *c, cram_slice *s, | |
1596 cram_block *blk, cram_record *cr) { | |
1597 int i, r = 0, out_sz = 1; | |
1598 unsigned char ntags; | |
1599 | |
1600 if (!c->comp_hdr->codecs[DS_TC]) return -1; | |
1601 r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk, | |
1602 (char *)&ntags, &out_sz); | |
1603 cr->ntags = ntags; | |
1604 | |
1605 //printf("TC=%d\n", cr->ntags); | |
1606 cr->aux_size = 0; | |
1607 cr->aux = BLOCK_SIZE(s->aux_blk); | |
1608 | |
1609 for (i = 0; i < cr->ntags; i++) { | |
1610 int32_t id, out_sz = 1; | |
1611 unsigned char tag_data[3]; | |
1612 cram_map *m; | |
1613 | |
1614 //printf("Tag %d/%d\n", i+1, cr->ntags); | |
1615 if (!c->comp_hdr->codecs[DS_TN]) return -1; | |
1616 r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN], | |
1617 blk, (char *)&id, &out_sz); | |
1618 if (out_sz == 3) { | |
1619 tag_data[0] = ((char *)&id)[0]; | |
1620 tag_data[1] = ((char *)&id)[1]; | |
1621 tag_data[2] = ((char *)&id)[2]; | |
1622 } else { | |
1623 tag_data[0] = (id>>16) & 0xff; | |
1624 tag_data[1] = (id>>8) & 0xff; | |
1625 tag_data[2] = id & 0xff; | |
1626 } | |
1627 | |
1628 m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); | |
1629 if (!m) | |
1630 return -1; | |
1631 BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); | |
1632 | |
1633 if (!m->codec) return -1; | |
1634 r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); | |
1635 | |
1636 cr->aux_size += out_sz + 3; | |
1637 } | |
1638 | |
1639 return r; | |
1640 } | |
1641 | |
1642 static int cram_decode_aux(cram_container *c, cram_slice *s, | |
1643 cram_block *blk, cram_record *cr) { | |
1644 int i, r = 0, out_sz = 1; | |
1645 int32_t TL = 0; | |
1646 unsigned char *TN; | |
1647 uint32_t ds = c->comp_hdr->data_series; | |
1648 | |
1649 if (!(ds & (CRAM_TL|CRAM_aux))) { | |
1650 cr->aux = 0; | |
1651 cr->aux_size = 0; | |
1652 return 0; | |
1653 } | |
1654 | |
1655 if (!c->comp_hdr->codecs[DS_TL]) return -1; | |
1656 r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk, | |
1657 (char *)&TL, &out_sz); | |
1658 if (r || TL < 0 || TL >= c->comp_hdr->nTL) | |
1659 return -1; | |
1660 | |
1661 TN = c->comp_hdr->TL[TL]; | |
1662 cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen | |
1663 | |
1664 //printf("TC=%d\n", cr->ntags); | |
1665 cr->aux_size = 0; | |
1666 cr->aux = BLOCK_SIZE(s->aux_blk); | |
1667 | |
1668 if (!(ds & CRAM_aux)) | |
1669 return 0; | |
1670 | |
1671 for (i = 0; i < cr->ntags; i++) { | |
1672 int32_t id, out_sz = 1; | |
1673 unsigned char tag_data[3]; | |
1674 cram_map *m; | |
1675 | |
1676 //printf("Tag %d/%d\n", i+1, cr->ntags); | |
1677 tag_data[0] = *TN++; | |
1678 tag_data[1] = *TN++; | |
1679 tag_data[2] = *TN++; | |
1680 id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2]; | |
1681 | |
1682 m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); | |
1683 if (!m) | |
1684 return -1; | |
1685 BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); | |
1686 | |
1687 if (!m->codec) return -1; | |
1688 r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); | |
1689 cr->aux_size += out_sz + 3; | |
1690 } | |
1691 | |
1692 return r; | |
1693 } | |
1694 | |
1695 /* Resolve mate pair cross-references between recs within this slice */ | |
1696 static void cram_decode_slice_xref(cram_slice *s, int required_fields) { | |
1697 int rec; | |
1698 | |
1699 if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) { | |
1700 for (rec = 0; rec < s->hdr->num_records; rec++) { | |
1701 cram_record *cr = &s->crecs[rec]; | |
1702 | |
1703 cr->tlen = 0; | |
1704 cr->mate_pos = 0; | |
1705 cr->mate_ref_id = -1; | |
1706 } | |
1707 | |
1708 return; | |
1709 } | |
1710 | |
1711 for (rec = 0; rec < s->hdr->num_records; rec++) { | |
1712 cram_record *cr = &s->crecs[rec]; | |
1713 | |
1714 if (cr->mate_line >= 0) { | |
1715 if (cr->mate_line < s->hdr->num_records) { | |
1716 /* | |
1717 * On the first read, loop through computing lengths. | |
1718 * It's not perfect as we have one slice per reference so we | |
1719 * cannot detect when TLEN should be zero due to seqs that | |
1720 * map to multiple references. | |
1721 * | |
1722 * We also cannot set tlen correct when it spans a slice for | |
1723 * other reasons. This may make tlen too small. Should we | |
1724 * fix this by forcing TLEN to be stored verbatim in such cases? | |
1725 * | |
1726 * Or do we just admit defeat and output 0 for tlen? It's the | |
1727 * safe option... | |
1728 */ | |
1729 if (cr->tlen == INT_MIN) { | |
1730 int id1 = rec, id2 = rec; | |
1731 int aleft = cr->apos, aright = cr->aend; | |
1732 int tlen; | |
1733 int ref = cr->ref_id; | |
1734 | |
1735 // number of segments starting at the same point. | |
1736 int left_cnt = 0; | |
1737 | |
1738 do { | |
1739 if (aleft > s->crecs[id2].apos) | |
1740 aleft = s->crecs[id2].apos, left_cnt = 1; | |
1741 else if (aleft == s->crecs[id2].apos) | |
1742 left_cnt++; | |
1743 if (aright < s->crecs[id2].aend) | |
1744 aright = s->crecs[id2].aend; | |
1745 if (s->crecs[id2].mate_line == -1) { | |
1746 s->crecs[id2].mate_line = rec; | |
1747 break; | |
1748 } | |
1749 assert(s->crecs[id2].mate_line > id2); | |
1750 id2 = s->crecs[id2].mate_line; | |
1751 | |
1752 if (s->crecs[id2].ref_id != ref) | |
1753 ref = -1; | |
1754 } while (id2 != id1); | |
1755 | |
1756 if (ref != -1) { | |
1757 tlen = aright - aleft + 1; | |
1758 id1 = id2 = rec; | |
1759 | |
1760 /* | |
1761 * When we have two seqs with identical start and | |
1762 * end coordinates, set +/- tlen based on 1st/last | |
1763 * bit flags instead, as a tie breaker. | |
1764 */ | |
1765 if (s->crecs[id2].apos == aleft) { | |
1766 if (left_cnt == 1 || | |
1767 (s->crecs[id2].flags & BAM_FREAD1)) | |
1768 s->crecs[id2].tlen = tlen; | |
1769 else | |
1770 s->crecs[id2].tlen = -tlen; | |
1771 } else { | |
1772 s->crecs[id2].tlen = -tlen; | |
1773 } | |
1774 | |
1775 id2 = s->crecs[id2].mate_line; | |
1776 while (id2 != id1) { | |
1777 if (s->crecs[id2].apos == aleft) { | |
1778 if (left_cnt == 1 || | |
1779 (s->crecs[id2].flags & BAM_FREAD1)) | |
1780 s->crecs[id2].tlen = tlen; | |
1781 else | |
1782 s->crecs[id2].tlen = -tlen; | |
1783 } else { | |
1784 s->crecs[id2].tlen = -tlen; | |
1785 } | |
1786 id2 = s->crecs[id2].mate_line; | |
1787 } | |
1788 } else { | |
1789 id1 = id2 = rec; | |
1790 | |
1791 s->crecs[id2].tlen = 0; | |
1792 id2 = s->crecs[id2].mate_line; | |
1793 while (id2 != id1) { | |
1794 s->crecs[id2].tlen = 0; | |
1795 id2 = s->crecs[id2].mate_line; | |
1796 } | |
1797 } | |
1798 } | |
1799 | |
1800 cr->mate_pos = s->crecs[cr->mate_line].apos; | |
1801 cr->mate_ref_id = s->crecs[cr->mate_line].ref_id; | |
1802 | |
1803 // paired | |
1804 cr->flags |= BAM_FPAIRED; | |
1805 | |
1806 // set mate unmapped if needed | |
1807 if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) { | |
1808 cr->flags |= BAM_FMUNMAP; | |
1809 cr->tlen = 0; | |
1810 } | |
1811 if (cr->flags & BAM_FUNMAP) { | |
1812 cr->tlen = 0; | |
1813 } | |
1814 | |
1815 // set mate reversed if needed | |
1816 if (s->crecs[cr->mate_line].flags & BAM_FREVERSE) | |
1817 cr->flags |= BAM_FMREVERSE; | |
1818 } else { | |
1819 fprintf(stderr, "Mate line out of bounds: %d vs [0, %d]\n", | |
1820 cr->mate_line, s->hdr->num_records-1); | |
1821 } | |
1822 | |
1823 /* FIXME: construct read names here too if needed */ | |
1824 } else { | |
1825 if (cr->mate_flags & CRAM_M_REVERSE) { | |
1826 cr->flags |= BAM_FPAIRED | BAM_FMREVERSE; | |
1827 } | |
1828 if (cr->mate_flags & CRAM_M_UNMAP) { | |
1829 cr->flags |= BAM_FMUNMAP; | |
1830 //cr->mate_ref_id = -1; | |
1831 } | |
1832 if (!(cr->flags & BAM_FPAIRED)) | |
1833 cr->mate_ref_id = -1; | |
1834 } | |
1835 | |
1836 if (cr->tlen == INT_MIN) | |
1837 cr->tlen = 0; // Just incase | |
1838 } | |
1839 } | |
1840 | |
1841 static char *md5_print(unsigned char *md5, char *out) { | |
1842 int i; | |
1843 for (i = 0; i < 16; i++) { | |
1844 out[i*2+0] = "0123456789abcdef"[md5[i]>>4]; | |
1845 out[i*2+1] = "0123456789abcdef"[md5[i]&15]; | |
1846 } | |
1847 out[32] = 0; | |
1848 | |
1849 return out; | |
1850 } | |
1851 | |
1852 /* | |
1853 * Decode an entire slice from container blocks. Fills out s->crecs[] array. | |
1854 * Returns 0 on success | |
1855 * -1 on failure | |
1856 */ | |
1857 int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, | |
1858 SAM_hdr *bfd) { | |
1859 cram_block *blk = s->block[0]; | |
1860 int32_t bf, ref_id; | |
1861 unsigned char cf; | |
1862 int out_sz, r = 0; | |
1863 int rec; | |
1864 char *seq = NULL, *qual = NULL; | |
1865 int unknown_rg = -1; | |
1866 int embed_ref; | |
1867 char **refs = NULL; | |
1868 uint32_t ds; | |
1869 | |
1870 if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0) | |
1871 return -1; | |
1872 | |
1873 ds = c->comp_hdr->data_series; | |
1874 | |
1875 blk->bit = 7; // MSB first | |
1876 | |
1877 /* Look for unknown RG, added as last by Java CRAM? */ | |
1878 if (bfd->nrg > 0 && | |
1879 !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN")) | |
1880 unknown_rg = bfd->nrg-1; | |
1881 | |
1882 if (blk->content_type != CORE) | |
1883 return -1; | |
1884 | |
1885 if (s->crecs) | |
1886 free(s->crecs); | |
1887 if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs)))) | |
1888 return -1; | |
1889 | |
1890 ref_id = s->hdr->ref_seq_id; | |
1891 embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0; | |
1892 | |
1893 if (ref_id >= 0) { | |
1894 if (embed_ref) { | |
1895 cram_block *b; | |
1896 if (s->hdr->ref_base_id < 0) { | |
1897 fprintf(stderr, "No reference specified and " | |
1898 "no embedded reference is available.\n"); | |
1899 return -1; | |
1900 } | |
1901 if (!s->block_by_id || | |
1902 !(b = s->block_by_id[s->hdr->ref_base_id])) | |
1903 return -1; | |
1904 cram_uncompress_block(b); | |
1905 s->ref = (char *)BLOCK_DATA(b); | |
1906 s->ref_start = s->hdr->ref_seq_start; | |
1907 s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1; | |
1908 } else if (!fd->no_ref) { | |
1909 //// Avoid Java cramtools bug by loading entire reference seq | |
1910 //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0); | |
1911 //s->ref_start = 1; | |
1912 | |
1913 if (fd->required_fields & SAM_SEQ) | |
1914 s->ref = | |
1915 cram_get_ref(fd, s->hdr->ref_seq_id, | |
1916 s->hdr->ref_seq_start, | |
1917 s->hdr->ref_seq_start + s->hdr->ref_seq_span -1); | |
1918 s->ref_start = s->hdr->ref_seq_start; | |
1919 s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1; | |
1920 | |
1921 /* Sanity check */ | |
1922 if (s->ref_start < 0) { | |
1923 fprintf(stderr, "Slice starts before base 1.\n"); | |
1924 s->ref_start = 0; | |
1925 } | |
1926 pthread_mutex_lock(&fd->ref_lock); | |
1927 pthread_mutex_lock(&fd->refs->lock); | |
1928 if ((fd->required_fields & SAM_SEQ) && | |
1929 s->ref_end > fd->refs->ref_id[ref_id]->length) { | |
1930 fprintf(stderr, "Slice ends beyond reference end.\n"); | |
1931 s->ref_end = fd->refs->ref_id[ref_id]->length; | |
1932 } | |
1933 pthread_mutex_unlock(&fd->refs->lock); | |
1934 pthread_mutex_unlock(&fd->ref_lock); | |
1935 } | |
1936 } | |
1937 | |
1938 if ((fd->required_fields & SAM_SEQ) && | |
1939 s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) { | |
1940 fprintf(stderr, "Unable to fetch reference #%d %d..%d\n", | |
1941 s->hdr->ref_seq_id, s->hdr->ref_seq_start, | |
1942 s->hdr->ref_seq_start + s->hdr->ref_seq_span-1); | |
1943 return -1; | |
1944 } | |
1945 | |
1946 if (CRAM_MAJOR_VERS(fd->version) != 1 | |
1947 && (fd->required_fields & SAM_SEQ) | |
1948 && s->hdr->ref_seq_id >= 0 | |
1949 && !fd->ignore_md5 | |
1950 && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) { | |
1951 MD5_CTX md5; | |
1952 unsigned char digest[16]; | |
1953 | |
1954 if (s->ref && s->hdr->ref_seq_id >= 0) { | |
1955 int start, len; | |
1956 | |
1957 if (s->hdr->ref_seq_start >= s->ref_start) { | |
1958 start = s->hdr->ref_seq_start - s->ref_start; | |
1959 } else { | |
1960 fprintf(stderr, "Slice starts before base 1.\n"); | |
1961 start = 0; | |
1962 } | |
1963 | |
1964 if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) { | |
1965 len = s->hdr->ref_seq_span; | |
1966 } else { | |
1967 fprintf(stderr, "Slice ends beyond reference end.\n"); | |
1968 len = s->ref_end - s->ref_start + 1; | |
1969 } | |
1970 | |
1971 MD5_Init(&md5); | |
1972 if (start + len > s->ref_end - s->ref_start + 1) | |
1973 len = s->ref_end - s->ref_start + 1 - start; | |
1974 if (len >= 0) | |
1975 MD5_Update(&md5, s->ref + start, len); | |
1976 MD5_Final(digest, &md5); | |
1977 } else if (!s->ref && s->hdr->ref_base_id >= 0) { | |
1978 cram_block *b; | |
1979 if (s->block_by_id && (b = s->block_by_id[s->hdr->ref_base_id])) { | |
1980 MD5_Init(&md5); | |
1981 MD5_Update(&md5, b->data, b->uncomp_size); | |
1982 MD5_Final(digest, &md5); | |
1983 } | |
1984 } | |
1985 | |
1986 if ((!s->ref && s->hdr->ref_base_id < 0) | |
1987 || memcmp(digest, s->hdr->md5, 16) != 0) { | |
1988 char M[33]; | |
1989 fprintf(stderr, "ERROR: md5sum reference mismatch for ref " | |
1990 "%d pos %d..%d\n", ref_id, s->ref_start, s->ref_end); | |
1991 fprintf(stderr, "CRAM: %s\n", md5_print(s->hdr->md5, M)); | |
1992 fprintf(stderr, "Ref : %s\n", md5_print(digest, M)); | |
1993 return -1; | |
1994 } | |
1995 } | |
1996 | |
1997 if (ref_id == -2) { | |
1998 pthread_mutex_lock(&fd->ref_lock); | |
1999 pthread_mutex_lock(&fd->refs->lock); | |
2000 refs = calloc(fd->refs->nref, sizeof(char *)); | |
2001 pthread_mutex_unlock(&fd->refs->lock); | |
2002 pthread_mutex_unlock(&fd->ref_lock); | |
2003 if (!refs) | |
2004 return -1; | |
2005 } | |
2006 | |
2007 for (rec = 0; rec < s->hdr->num_records; rec++) { | |
2008 cram_record *cr = &s->crecs[rec]; | |
2009 | |
2010 //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit); | |
2011 | |
2012 cr->s = s; | |
2013 | |
2014 out_sz = 1; /* decode 1 item */ | |
2015 if (ds & CRAM_BF) { | |
2016 if (!c->comp_hdr->codecs[DS_BF]) return -1; | |
2017 r |= c->comp_hdr->codecs[DS_BF] | |
2018 ->decode(s, c->comp_hdr->codecs[DS_BF], blk, | |
2019 (char *)&bf, &out_sz); | |
2020 if (bf < 0 || | |
2021 bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap)) | |
2022 return -1; | |
2023 bf = fd->bam_flag_swap[bf]; | |
2024 cr->flags = bf; | |
2025 } else { | |
2026 cr->flags = bf = 0x4; // unmapped | |
2027 } | |
2028 | |
2029 if (ds & CRAM_CF) { | |
2030 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
2031 /* CF is byte in 1.0, int32 in 2.0 */ | |
2032 if (!c->comp_hdr->codecs[DS_CF]) return -1; | |
2033 r |= c->comp_hdr->codecs[DS_CF] | |
2034 ->decode(s, c->comp_hdr->codecs[DS_CF], blk, | |
2035 (char *)&cf, &out_sz); | |
2036 cr->cram_flags = cf; | |
2037 } else { | |
2038 if (!c->comp_hdr->codecs[DS_CF]) return -1; | |
2039 r |= c->comp_hdr->codecs[DS_CF] | |
2040 ->decode(s, c->comp_hdr->codecs[DS_CF], blk, | |
2041 (char *)&cr->cram_flags, | |
2042 &out_sz); | |
2043 cf = cr->cram_flags; | |
2044 } | |
2045 } | |
2046 | |
2047 if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) { | |
2048 if (ds & CRAM_RI) { | |
2049 if (!c->comp_hdr->codecs[DS_RI]) return -1; | |
2050 r |= c->comp_hdr->codecs[DS_RI] | |
2051 ->decode(s, c->comp_hdr->codecs[DS_RI], blk, | |
2052 (char *)&cr->ref_id, &out_sz); | |
2053 if ((fd->required_fields & (SAM_SEQ|SAM_TLEN)) | |
2054 && cr->ref_id >= 0) { | |
2055 if (!fd->no_ref) { | |
2056 if (!refs[cr->ref_id]) | |
2057 refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, | |
2058 1, 0); | |
2059 s->ref = refs[cr->ref_id]; | |
2060 } | |
2061 s->ref_start = 1; | |
2062 pthread_mutex_lock(&fd->ref_lock); | |
2063 pthread_mutex_lock(&fd->refs->lock); | |
2064 s->ref_end = fd->refs->ref_id[cr->ref_id]->length; | |
2065 pthread_mutex_unlock(&fd->refs->lock); | |
2066 pthread_mutex_unlock(&fd->ref_lock); | |
2067 } | |
2068 } else { | |
2069 cr->ref_id = 0; | |
2070 } | |
2071 } else { | |
2072 cr->ref_id = ref_id; // Forced constant in CRAM 1.0 | |
2073 } | |
2074 | |
2075 | |
2076 if (ds & CRAM_RL) { | |
2077 if (!c->comp_hdr->codecs[DS_RL]) return -1; | |
2078 r |= c->comp_hdr->codecs[DS_RL] | |
2079 ->decode(s, c->comp_hdr->codecs[DS_RL], blk, | |
2080 (char *)&cr->len, &out_sz); | |
2081 } | |
2082 | |
2083 if (ds & CRAM_AP) { | |
2084 if (!c->comp_hdr->codecs[DS_AP]) return -1; | |
2085 r |= c->comp_hdr->codecs[DS_AP] | |
2086 ->decode(s, c->comp_hdr->codecs[DS_AP], blk, | |
2087 (char *)&cr->apos, &out_sz); | |
2088 if (c->comp_hdr->AP_delta) | |
2089 cr->apos += s->last_apos; | |
2090 s->last_apos= cr->apos; | |
2091 } else { | |
2092 cr->apos = c->ref_seq_start; | |
2093 } | |
2094 | |
2095 if (ds & CRAM_RG) { | |
2096 if (!c->comp_hdr->codecs[DS_RG]) return -1; | |
2097 r |= c->comp_hdr->codecs[DS_RG] | |
2098 ->decode(s, c->comp_hdr->codecs[DS_RG], blk, | |
2099 (char *)&cr->rg, &out_sz); | |
2100 if (cr->rg == unknown_rg) | |
2101 cr->rg = -1; | |
2102 } else { | |
2103 cr->rg = -1; | |
2104 } | |
2105 | |
2106 cr->name_len = 0; | |
2107 | |
2108 if (c->comp_hdr->read_names_included) { | |
2109 int32_t out_sz2 = 1; | |
2110 | |
2111 // Read directly into name cram_block | |
2112 cr->name = BLOCK_SIZE(s->name_blk); | |
2113 if (ds & CRAM_RN) { | |
2114 if (!c->comp_hdr->codecs[DS_RN]) return -1; | |
2115 r |= c->comp_hdr->codecs[DS_RN] | |
2116 ->decode(s, c->comp_hdr->codecs[DS_RN], blk, | |
2117 (char *)s->name_blk, &out_sz2); | |
2118 cr->name_len = out_sz2; | |
2119 } | |
2120 } | |
2121 | |
2122 cr->mate_pos = 0; | |
2123 cr->mate_line = -1; | |
2124 cr->mate_ref_id = -1; | |
2125 if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) { | |
2126 if (ds & CRAM_MF) { | |
2127 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
2128 /* MF is byte in 1.0, int32 in 2.0 */ | |
2129 unsigned char mf; | |
2130 if (!c->comp_hdr->codecs[DS_MF]) return -1; | |
2131 r |= c->comp_hdr->codecs[DS_MF] | |
2132 ->decode(s, c->comp_hdr->codecs[DS_MF], | |
2133 blk, (char *)&mf, &out_sz); | |
2134 cr->mate_flags = mf; | |
2135 } else { | |
2136 if (!c->comp_hdr->codecs[DS_MF]) return -1; | |
2137 r |= c->comp_hdr->codecs[DS_MF] | |
2138 ->decode(s, c->comp_hdr->codecs[DS_MF], | |
2139 blk, | |
2140 (char *)&cr->mate_flags, | |
2141 &out_sz); | |
2142 } | |
2143 } else { | |
2144 cr->mate_flags = 0; | |
2145 } | |
2146 | |
2147 if (!c->comp_hdr->read_names_included) { | |
2148 int32_t out_sz2 = 1; | |
2149 | |
2150 // Read directly into name cram_block | |
2151 cr->name = BLOCK_SIZE(s->name_blk); | |
2152 if (ds & CRAM_RN) { | |
2153 if (!c->comp_hdr->codecs[DS_RN]) return -1; | |
2154 r |= c->comp_hdr->codecs[DS_RN] | |
2155 ->decode(s, c->comp_hdr->codecs[DS_RN], | |
2156 blk, (char *)s->name_blk, | |
2157 &out_sz2); | |
2158 cr->name_len = out_sz2; | |
2159 } | |
2160 } | |
2161 | |
2162 if (ds & CRAM_NS) { | |
2163 if (!c->comp_hdr->codecs[DS_NS]) return -1; | |
2164 r |= c->comp_hdr->codecs[DS_NS] | |
2165 ->decode(s, c->comp_hdr->codecs[DS_NS], blk, | |
2166 (char *)&cr->mate_ref_id, &out_sz); | |
2167 } | |
2168 | |
2169 // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown. | |
2170 // if (cr->mate_ref_id == -1 && cr->flags & 0x01) { | |
2171 // /* Paired, but unmapped */ | |
2172 // cr->flags |= BAM_FMUNMAP; | |
2173 // } | |
2174 | |
2175 if (ds & CRAM_NP) { | |
2176 if (!c->comp_hdr->codecs[DS_NP]) return -1; | |
2177 r |= c->comp_hdr->codecs[DS_NP] | |
2178 ->decode(s, c->comp_hdr->codecs[DS_NP], blk, | |
2179 (char *)&cr->mate_pos, &out_sz); | |
2180 } | |
2181 | |
2182 if (ds & CRAM_TS) { | |
2183 if (!c->comp_hdr->codecs[DS_TS]) return -1; | |
2184 r |= c->comp_hdr->codecs[DS_TS] | |
2185 ->decode(s, c->comp_hdr->codecs[DS_TS], blk, | |
2186 (char *)&cr->tlen, &out_sz); | |
2187 } else { | |
2188 cr->tlen = INT_MIN; | |
2189 } | |
2190 } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) { | |
2191 if (ds & CRAM_NF) { | |
2192 if (!c->comp_hdr->codecs[DS_NF]) return -1; | |
2193 r |= c->comp_hdr->codecs[DS_NF] | |
2194 ->decode(s, c->comp_hdr->codecs[DS_NF], blk, | |
2195 (char *)&cr->mate_line, &out_sz); | |
2196 cr->mate_line += rec + 1; | |
2197 | |
2198 //cr->name_len = sprintf(name, "%d", name_id++); | |
2199 //cr->name = DSTRING_LEN(name_ds); | |
2200 //dstring_nappend(name_ds, name, cr->name_len); | |
2201 | |
2202 cr->mate_ref_id = -1; | |
2203 cr->tlen = INT_MIN; | |
2204 cr->mate_pos = 0; | |
2205 } else { | |
2206 cr->mate_flags = 0; | |
2207 cr->tlen = INT_MIN; | |
2208 } | |
2209 } else { | |
2210 cr->mate_flags = 0; | |
2211 cr->tlen = INT_MIN; | |
2212 } | |
2213 /* | |
2214 else if (!name[0]) { | |
2215 //name[0] = '?'; name[1] = 0; | |
2216 //cr->name_len = 1; | |
2217 //cr->name= DSTRING_LEN(s->name_ds); | |
2218 //dstring_nappend(s->name_ds, "?", 1); | |
2219 | |
2220 cr->mate_ref_id = -1; | |
2221 cr->tlen = 0; | |
2222 cr->mate_pos = 0; | |
2223 } | |
2224 */ | |
2225 | |
2226 /* Auxiliary tags */ | |
2227 if (CRAM_MAJOR_VERS(fd->version) == 1) | |
2228 r |= cram_decode_aux_1_0(c, s, blk, cr); | |
2229 else | |
2230 r |= cram_decode_aux(c, s, blk, cr); | |
2231 | |
2232 /* Fake up dynamic string growth and appending */ | |
2233 if (ds & CRAM_RL) { | |
2234 cr->seq = BLOCK_SIZE(s->seqs_blk); | |
2235 BLOCK_GROW(s->seqs_blk, cr->len); | |
2236 seq = (char *)BLOCK_END(s->seqs_blk); | |
2237 BLOCK_SIZE(s->seqs_blk) += cr->len; | |
2238 | |
2239 if (!seq) | |
2240 return -1; | |
2241 | |
2242 cr->qual = BLOCK_SIZE(s->qual_blk); | |
2243 BLOCK_GROW(s->qual_blk, cr->len); | |
2244 qual = (char *)BLOCK_END(s->qual_blk); | |
2245 BLOCK_SIZE(s->qual_blk) += cr->len; | |
2246 | |
2247 if (!s->ref) | |
2248 memset(seq, '=', cr->len); | |
2249 } | |
2250 | |
2251 if (!(bf & BAM_FUNMAP)) { | |
2252 /* Decode sequence and generate CIGAR */ | |
2253 if (ds & (CRAM_SEQ | CRAM_MQ)) { | |
2254 r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual); | |
2255 } else { | |
2256 cr->cigar = 0; | |
2257 cr->ncigar = 0; | |
2258 cr->aend = cr->apos; | |
2259 cr->mqual = 0; | |
2260 } | |
2261 } else { | |
2262 int out_sz2 = cr->len; | |
2263 | |
2264 //puts("Unmapped"); | |
2265 cr->cigar = 0; | |
2266 cr->ncigar = 0; | |
2267 cr->aend = cr->apos; | |
2268 cr->mqual = 0; | |
2269 | |
2270 if (ds & CRAM_BA) { | |
2271 if (!c->comp_hdr->codecs[DS_BA]) return -1; | |
2272 r |= c->comp_hdr->codecs[DS_BA] | |
2273 ->decode(s, c->comp_hdr->codecs[DS_BA], blk, | |
2274 (char *)seq, &out_sz2); | |
2275 } | |
2276 | |
2277 if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { | |
2278 out_sz2 = cr->len; | |
2279 if (ds & CRAM_QS) { | |
2280 if (!c->comp_hdr->codecs[DS_QS]) return -1; | |
2281 r |= c->comp_hdr->codecs[DS_QS] | |
2282 ->decode(s, c->comp_hdr->codecs[DS_QS], | |
2283 blk, qual, &out_sz2); | |
2284 } | |
2285 } else { | |
2286 if (ds & CRAM_RL) | |
2287 memset(qual, 30, cr->len); | |
2288 } | |
2289 } | |
2290 } | |
2291 | |
2292 pthread_mutex_lock(&fd->ref_lock); | |
2293 if (refs) { | |
2294 int i; | |
2295 for (i = 0; i < fd->refs->nref; i++) { | |
2296 if (refs[i]) | |
2297 cram_ref_decr(fd->refs, i); | |
2298 } | |
2299 free(refs); | |
2300 } else if (ref_id >= 0 && s->ref != fd->ref_free) { | |
2301 cram_ref_decr(fd->refs, ref_id); | |
2302 } | |
2303 pthread_mutex_unlock(&fd->ref_lock); | |
2304 | |
2305 /* Resolve mate pair cross-references between recs within this slice */ | |
2306 cram_decode_slice_xref(s, fd->required_fields); | |
2307 | |
2308 return r; | |
2309 } | |
2310 | |
2311 typedef struct { | |
2312 cram_fd *fd; | |
2313 cram_container *c; | |
2314 cram_slice *s; | |
2315 SAM_hdr *h; | |
2316 int exit_code; | |
2317 } cram_decode_job; | |
2318 | |
2319 void *cram_decode_slice_thread(void *arg) { | |
2320 cram_decode_job *j = (cram_decode_job *)arg; | |
2321 | |
2322 j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h); | |
2323 | |
2324 return j; | |
2325 } | |
2326 | |
2327 /* | |
2328 * Spawn a multi-threaded version of cram_decode_slice(). | |
2329 */ | |
2330 int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s, | |
2331 SAM_hdr *bfd) { | |
2332 cram_decode_job *j; | |
2333 int nonblock; | |
2334 | |
2335 if (!fd->pool) | |
2336 return cram_decode_slice(fd, c, s, bfd); | |
2337 | |
2338 if (!(j = malloc(sizeof(*j)))) | |
2339 return -1; | |
2340 | |
2341 j->fd = fd; | |
2342 j->c = c; | |
2343 j->s = s; | |
2344 j->h = bfd; | |
2345 | |
2346 nonblock = t_pool_results_queue_sz(fd->rqueue) ? 1 : 0; | |
2347 | |
2348 if (-1 == t_pool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread, | |
2349 j, nonblock)) { | |
2350 /* Would block */ | |
2351 fd->job_pending = j; | |
2352 } else { | |
2353 fd->job_pending = NULL; | |
2354 } | |
2355 | |
2356 // flush too | |
2357 return 0; | |
2358 } | |
2359 | |
2360 | |
2361 /* ---------------------------------------------------------------------- | |
2362 * CRAM sequence iterators. | |
2363 */ | |
2364 | |
2365 /* | |
2366 * Converts a cram in-memory record into a bam in-memory record. We | |
2367 * pass a pointer to a bam_seq_t pointer along with the a pointer to | |
2368 * the allocated size. These can initially be pointers to NULL and zero. | |
2369 * | |
2370 * This function will reallocate the bam buffer as required and update | |
2371 * (*bam)->alloc accordingly, allowing it to be used within a loop | |
2372 * efficiently without needing to allocate new bam objects over and | |
2373 * over again. | |
2374 * | |
2375 * Returns the used size of the bam record on success | |
2376 * -1 on failure. | |
2377 */ | |
2378 static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s, | |
2379 cram_record *cr, int rec, bam_seq_t **bam) { | |
2380 int bam_idx, rg_len; | |
2381 char name_a[1024], *name; | |
2382 int name_len; | |
2383 char *aux, *aux_orig; | |
2384 char *seq, *qual; | |
2385 | |
2386 /* Assign names if not explicitly set */ | |
2387 if (fd->required_fields & SAM_QNAME) { | |
2388 if (cr->name_len) { | |
2389 name = (char *)BLOCK_DATA(s->name_blk) + cr->name; | |
2390 name_len = cr->name_len; | |
2391 } else { | |
2392 name = name_a; | |
2393 name_len = strlen(fd->prefix); | |
2394 memcpy(name, fd->prefix, name_len); | |
2395 name += name_len; | |
2396 *name++ = ':'; | |
2397 if (cr->mate_line >= 0 && cr->mate_line < rec) | |
2398 name = (char *)append_uint64((unsigned char *)name, | |
2399 s->hdr->record_counter + | |
2400 cr->mate_line + 1); | |
2401 else | |
2402 name = (char *)append_uint64((unsigned char *)name, | |
2403 s->hdr->record_counter + | |
2404 rec + 1); | |
2405 name_len = name - name_a; | |
2406 name = name_a; | |
2407 } | |
2408 } else { | |
2409 name = "?"; | |
2410 name_len = 1; | |
2411 } | |
2412 | |
2413 /* Generate BAM record */ | |
2414 if (cr->rg < -1 || cr->rg >= bfd->nrg) | |
2415 return -1; | |
2416 rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0; | |
2417 | |
2418 if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) { | |
2419 if (!BLOCK_DATA(s->seqs_blk)) | |
2420 return -1; | |
2421 seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq; | |
2422 } else { | |
2423 seq = "*"; | |
2424 cr->len = 1; | |
2425 } | |
2426 | |
2427 | |
2428 if (fd->required_fields & SAM_QUAL) { | |
2429 if (!BLOCK_DATA(s->qual_blk)) | |
2430 return -1; | |
2431 qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual; | |
2432 } else { | |
2433 qual = NULL; | |
2434 } | |
2435 | |
2436 bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len, | |
2437 name, name_len, | |
2438 cr->flags, | |
2439 cr->ref_id, | |
2440 cr->apos, | |
2441 cr->aend, | |
2442 cr->mqual, | |
2443 cr->ncigar, &s->cigar[cr->cigar], | |
2444 cr->mate_ref_id, | |
2445 cr->mate_pos, | |
2446 cr->tlen, | |
2447 cr->len, | |
2448 seq, | |
2449 qual); | |
2450 if (bam_idx == -1) | |
2451 return -1; | |
2452 | |
2453 aux = aux_orig = (char *)bam_aux(*bam); | |
2454 | |
2455 /* Auxiliary strings */ | |
2456 if (cr->aux_size != 0) { | |
2457 memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size); | |
2458 aux += cr->aux_size; | |
2459 } | |
2460 | |
2461 /* RG:Z: */ | |
2462 if (cr->rg != -1) { | |
2463 int len = bfd->rg[cr->rg].name_len; | |
2464 *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z'; | |
2465 memcpy(aux, bfd->rg[cr->rg].name, len); | |
2466 aux += len; | |
2467 *aux++ = 0; | |
2468 } | |
2469 | |
2470 return bam_idx + (aux - aux_orig); | |
2471 } | |
2472 | |
2473 /* | |
2474 * Here be dragons! The multi-threading code in this is crufty beyond belief. | |
2475 */ | |
2476 static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { | |
2477 cram_container *c; | |
2478 cram_slice *s = NULL; | |
2479 | |
2480 if (!(c = fd->ctr)) { | |
2481 // Load first container. | |
2482 do { | |
2483 if (!(c = fd->ctr = cram_read_container(fd))) | |
2484 return NULL; | |
2485 } while (c->length == 0); | |
2486 | |
2487 /* | |
2488 * The first container may be a result of a sub-range query. | |
2489 * In which case it may still not be the optimal starting point | |
2490 * due to skipped containers/slices in the index. | |
2491 */ | |
2492 if (fd->range.refid != -2) { | |
2493 while (c->ref_seq_id != -2 && | |
2494 (c->ref_seq_id < fd->range.refid || | |
2495 c->ref_seq_start + c->ref_seq_span-1 < fd->range.start)) { | |
2496 if (0 != cram_seek(fd, c->length, SEEK_CUR)) | |
2497 return NULL; | |
2498 cram_free_container(fd->ctr); | |
2499 do { | |
2500 if (!(c = fd->ctr = cram_read_container(fd))) | |
2501 return NULL; | |
2502 } while (c->length == 0); | |
2503 } | |
2504 | |
2505 if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) | |
2506 return NULL; | |
2507 } | |
2508 | |
2509 if (!(c->comp_hdr_block = cram_read_block(fd))) | |
2510 return NULL; | |
2511 if (c->comp_hdr_block->content_type != COMPRESSION_HEADER) | |
2512 return NULL; | |
2513 | |
2514 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block); | |
2515 if (!c->comp_hdr) | |
2516 return NULL; | |
2517 if (!c->comp_hdr->AP_delta) { | |
2518 pthread_mutex_lock(&fd->ref_lock); | |
2519 fd->unsorted = 1; | |
2520 pthread_mutex_unlock(&fd->ref_lock); | |
2521 } | |
2522 } | |
2523 | |
2524 if ((s = c->slice)) { | |
2525 c->slice = NULL; | |
2526 cram_free_slice(s); | |
2527 s = NULL; | |
2528 } | |
2529 | |
2530 if (c->curr_slice == c->max_slice) { | |
2531 cram_free_container(c); | |
2532 c = NULL; | |
2533 } | |
2534 | |
2535 /* Sorry this is so contorted! */ | |
2536 for (;;) { | |
2537 if (fd->job_pending) { | |
2538 cram_decode_job *j = (cram_decode_job *)fd->job_pending; | |
2539 c = j->c; | |
2540 s = j->s; | |
2541 free(fd->job_pending); | |
2542 fd->job_pending = NULL; | |
2543 } else if (!fd->ooc) { | |
2544 empty_container: | |
2545 if (!c || c->curr_slice == c->max_slice) { | |
2546 // new container | |
2547 do { | |
2548 if (!(c = fd->ctr = cram_read_container(fd))) { | |
2549 if (fd->pool) { | |
2550 fd->ooc = 1; | |
2551 break; | |
2552 } | |
2553 | |
2554 return NULL; | |
2555 } | |
2556 } while (c->length == 0); | |
2557 if (fd->ooc) | |
2558 break; | |
2559 | |
2560 /* Skip containers not yet spanning our range */ | |
2561 if (fd->range.refid != -2 && c->ref_seq_id != -2) { | |
2562 fd->required_fields |= SAM_POS; | |
2563 | |
2564 if (c->ref_seq_id != fd->range.refid) { | |
2565 cram_free_container(c); | |
2566 fd->ctr = NULL; | |
2567 fd->ooc = 1; | |
2568 fd->eof = 1; | |
2569 break; | |
2570 } | |
2571 | |
2572 if (c->ref_seq_start > fd->range.end) { | |
2573 cram_free_container(c); | |
2574 fd->ctr = NULL; | |
2575 fd->ooc = 1; | |
2576 fd->eof = 1; | |
2577 break; | |
2578 } | |
2579 | |
2580 if (c->ref_seq_start + c->ref_seq_span-1 < | |
2581 fd->range.start) { | |
2582 c->curr_rec = c->max_rec; | |
2583 c->curr_slice = c->max_slice; | |
2584 cram_seek(fd, c->length, SEEK_CUR); | |
2585 cram_free_container(c); | |
2586 c = NULL; | |
2587 continue; | |
2588 } | |
2589 } | |
2590 | |
2591 if (!(c->comp_hdr_block = cram_read_block(fd))) | |
2592 return NULL; | |
2593 if (c->comp_hdr_block->content_type != COMPRESSION_HEADER) | |
2594 return NULL; | |
2595 | |
2596 c->comp_hdr = | |
2597 cram_decode_compression_header(fd, c->comp_hdr_block); | |
2598 if (!c->comp_hdr) | |
2599 return NULL; | |
2600 | |
2601 if (!c->comp_hdr->AP_delta) { | |
2602 pthread_mutex_lock(&fd->ref_lock); | |
2603 fd->unsorted = 1; | |
2604 pthread_mutex_unlock(&fd->ref_lock); | |
2605 } | |
2606 } | |
2607 | |
2608 if (c->num_records == 0) { | |
2609 cram_free_container(c); c = NULL; | |
2610 goto empty_container; | |
2611 } | |
2612 | |
2613 | |
2614 if (!(s = c->slice = cram_read_slice(fd))) | |
2615 return NULL; | |
2616 c->curr_slice++; | |
2617 c->curr_rec = 0; | |
2618 c->max_rec = s->hdr->num_records; | |
2619 | |
2620 s->last_apos = s->hdr->ref_seq_start; | |
2621 | |
2622 /* Skip slices not yet spanning our range */ | |
2623 if (fd->range.refid != -2 && s->hdr->ref_seq_id != -2) { | |
2624 if (s->hdr->ref_seq_id != fd->range.refid) { | |
2625 fd->eof = 1; | |
2626 cram_free_slice(s); | |
2627 c->slice = NULL; | |
2628 return NULL; | |
2629 } | |
2630 | |
2631 if (s->hdr->ref_seq_start > fd->range.end) { | |
2632 fd->eof = 1; | |
2633 cram_free_slice(s); | |
2634 c->slice = NULL; | |
2635 return NULL; | |
2636 } | |
2637 | |
2638 if (s->hdr->ref_seq_start + s->hdr->ref_seq_span-1 < | |
2639 fd->range.start) { | |
2640 cram_free_slice(s); | |
2641 c->slice = NULL; | |
2642 cram_free_container(c); | |
2643 c = NULL; | |
2644 continue; | |
2645 } | |
2646 } | |
2647 } | |
2648 | |
2649 /* Test decoding of 1st seq */ | |
2650 if (!c || !s) | |
2651 break; | |
2652 | |
2653 if (cram_decode_slice_mt(fd, c, s, fd->header) != 0) { | |
2654 // if (cram_decode_slice(fd, c, s, fd->header) != 0) { | |
2655 fprintf(stderr, "Failure to decode slice\n"); | |
2656 cram_free_slice(s); | |
2657 c->slice = NULL; | |
2658 return NULL; | |
2659 } | |
2660 | |
2661 if (!fd->pool || fd->job_pending) | |
2662 break; | |
2663 | |
2664 // Push it a bit far, to qsize in queue rather than pending arrival, | |
2665 // as cram tends to be a bit bursty in decode timings. | |
2666 if (t_pool_results_queue_len(fd->rqueue) > fd->pool->qsize) | |
2667 break; | |
2668 } | |
2669 | |
2670 if (fd->pool) { | |
2671 t_pool_result *res; | |
2672 cram_decode_job *j; | |
2673 | |
2674 // fprintf(stderr, "Thread pool len = %d, %d\n", | |
2675 // t_pool_results_queue_len(fd->rqueue), | |
2676 // t_pool_results_queue_sz(fd->rqueue)); | |
2677 | |
2678 if (fd->ooc && t_pool_results_queue_empty(fd->rqueue)) | |
2679 return NULL; | |
2680 | |
2681 res = t_pool_next_result_wait(fd->rqueue); | |
2682 | |
2683 if (!res || !res->data) { | |
2684 fprintf(stderr, "t_pool_next_result failure\n"); | |
2685 return NULL; | |
2686 } | |
2687 | |
2688 j = (cram_decode_job *)res->data; | |
2689 c = j->c; | |
2690 s = j->s; | |
2691 | |
2692 fd->ctr = c; | |
2693 | |
2694 t_pool_delete_result(res, 1); | |
2695 } | |
2696 | |
2697 *cp = c; | |
2698 return s; | |
2699 } | |
2700 | |
2701 /* | |
2702 * Read the next cram record and return it. | |
2703 * Note that to decode cram_record the caller will need to look up some data | |
2704 * in the current slice, pointed to by fd->ctr->slice. This is valid until | |
2705 * the next call to cram_get_seq (which may invalidate it). | |
2706 * | |
2707 * Returns record pointer on success (do not free) | |
2708 * NULL on failure | |
2709 */ | |
2710 cram_record *cram_get_seq(cram_fd *fd) { | |
2711 cram_container *c; | |
2712 cram_slice *s; | |
2713 | |
2714 for (;;) { | |
2715 c = fd->ctr; | |
2716 if (c && c->slice && c->curr_rec < c->max_rec) { | |
2717 s = c->slice; | |
2718 } else { | |
2719 if (!(s = cram_next_slice(fd, &c))) | |
2720 return NULL; | |
2721 } | |
2722 | |
2723 if (fd->range.refid != -2) { | |
2724 if (s->crecs[c->curr_rec].ref_id < fd->range.refid) { | |
2725 c->curr_rec++; | |
2726 continue; | |
2727 } | |
2728 | |
2729 if (s->crecs[c->curr_rec].ref_id != fd->range.refid) { | |
2730 fd->eof = 1; | |
2731 cram_free_slice(s); | |
2732 c->slice = NULL; | |
2733 return NULL; | |
2734 } | |
2735 | |
2736 if (s->crecs[c->curr_rec].apos > fd->range.end) { | |
2737 fd->eof = 1; | |
2738 cram_free_slice(s); | |
2739 c->slice = NULL; | |
2740 return NULL; | |
2741 } | |
2742 | |
2743 if (s->crecs[c->curr_rec].aend < fd->range.start) { | |
2744 c->curr_rec++; | |
2745 continue; | |
2746 } | |
2747 } | |
2748 | |
2749 break; | |
2750 } | |
2751 | |
2752 fd->ctr = c; | |
2753 c->slice = s; | |
2754 return &s->crecs[c->curr_rec++]; | |
2755 } | |
2756 | |
2757 /* | |
2758 * Read the next cram record and convert it to a bam_seq_t struct. | |
2759 * | |
2760 * Returns 0 on success | |
2761 * -1 on EOF or failure (check fd->err) | |
2762 */ | |
2763 int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) { | |
2764 cram_record *cr; | |
2765 cram_container *c; | |
2766 cram_slice *s; | |
2767 | |
2768 if (!(cr = cram_get_seq(fd))) | |
2769 return -1; | |
2770 | |
2771 c = fd->ctr; | |
2772 s = c->slice; | |
2773 | |
2774 return cram_to_bam(fd->header, fd, s, cr, c->curr_rec-1, bam); | |
2775 } |