Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/cram_encode.c @ 0:dfa3745e5fd8
Uploaded
| author | youngkim |
|---|---|
| date | Thu, 24 Mar 2016 17:12:52 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfa3745e5fd8 |
|---|---|
| 1 /* | |
| 2 Copyright (c) 2012-2013 Genome Research Ltd. | |
| 3 Author: James Bonfield <jkb@sanger.ac.uk> | |
| 4 | |
| 5 Redistribution and use in source and binary forms, with or without | |
| 6 modification, are permitted provided that the following conditions are met: | |
| 7 | |
| 8 1. Redistributions of source code must retain the above copyright notice, | |
| 9 this list of conditions and the following disclaimer. | |
| 10 | |
| 11 2. Redistributions in binary form must reproduce the above copyright notice, | |
| 12 this list of conditions and the following disclaimer in the documentation | |
| 13 and/or other materials provided with the distribution. | |
| 14 | |
| 15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
| 16 Institute nor the names of its contributors may be used to endorse or promote | |
| 17 products derived from this software without specific prior written permission. | |
| 18 | |
| 19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
| 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
| 23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 29 */ | |
| 30 | |
| 31 #ifdef HAVE_CONFIG_H | |
| 32 #include "io_lib_config.h" | |
| 33 #endif | |
| 34 | |
| 35 #include <stdio.h> | |
| 36 #include <errno.h> | |
| 37 #include <assert.h> | |
| 38 #include <stdlib.h> | |
| 39 #include <string.h> | |
| 40 #include <zlib.h> | |
| 41 #include <sys/types.h> | |
| 42 #include <sys/stat.h> | |
| 43 #include <math.h> | |
| 44 #include <ctype.h> | |
| 45 | |
| 46 #include "cram/cram.h" | |
| 47 #include "cram/os.h" | |
| 48 #include "cram/md5.h" | |
| 49 | |
| 50 #define Z_CRAM_STRAT Z_FILTERED | |
| 51 //#define Z_CRAM_STRAT Z_RLE | |
| 52 //#define Z_CRAM_STRAT Z_HUFFMAN_ONLY | |
| 53 //#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY | |
| 54 | |
| 55 static int process_one_read(cram_fd *fd, cram_container *c, | |
| 56 cram_slice *s, cram_record *cr, | |
| 57 bam_seq_t *b, int rnum); | |
| 58 | |
| 59 /* | |
| 60 * Returns index of val into key. | |
| 61 * Basically strchr(key, val)-key; | |
| 62 */ | |
| 63 static int sub_idx(char *key, char val) { | |
| 64 int i; | |
| 65 | |
| 66 for (i = 0; *key && *key++ != val; i++); | |
| 67 return i; | |
| 68 } | |
| 69 | |
| 70 /* | |
| 71 * Encodes a compression header block into a generic cram_block structure. | |
| 72 * | |
| 73 * Returns cram_block ptr on success | |
| 74 * NULL on failure | |
| 75 */ | |
| 76 cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, | |
| 77 cram_block_compression_hdr *h) { | |
| 78 cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0); | |
| 79 cram_block *map = cram_new_block(COMPRESSION_HEADER, 0); | |
| 80 int i, mc; | |
| 81 | |
| 82 if (!cb || !map) | |
| 83 return NULL; | |
| 84 | |
| 85 /* | |
| 86 * This is a concatenation of several blocks of data: | |
| 87 * header + landmarks, preservation map, read encoding map, and the tag | |
| 88 * encoding map. | |
| 89 * All 4 are variable sized and we need to know how large these are | |
| 90 * before creating the compression header itself as this starts with | |
| 91 * the total size (stored as a variable length string). | |
| 92 */ | |
| 93 | |
| 94 // Duplicated from container itself, and removed in 1.1 | |
| 95 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
| 96 itf8_put_blk(cb, h->ref_seq_id); | |
| 97 itf8_put_blk(cb, h->ref_seq_start); | |
| 98 itf8_put_blk(cb, h->ref_seq_span); | |
| 99 itf8_put_blk(cb, h->num_records); | |
| 100 itf8_put_blk(cb, h->num_landmarks); | |
| 101 for (i = 0; i < h->num_landmarks; i++) { | |
| 102 itf8_put_blk(cb, h->landmark[i]); | |
| 103 } | |
| 104 } | |
| 105 | |
| 106 /* Create in-memory preservation map */ | |
| 107 /* FIXME: should create this when we create the container */ | |
| 108 { | |
| 109 khint_t k; | |
| 110 int r; | |
| 111 | |
| 112 if (!(h->preservation_map = kh_init(map))) | |
| 113 return NULL; | |
| 114 | |
| 115 k = kh_put(map, h->preservation_map, "RN", &r); | |
| 116 if (-1 == r) return NULL; | |
| 117 kh_val(h->preservation_map, k).i = 1; | |
| 118 | |
| 119 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
| 120 k = kh_put(map, h->preservation_map, "PI", &r); | |
| 121 if (-1 == r) return NULL; | |
| 122 kh_val(h->preservation_map, k).i = 0; | |
| 123 | |
| 124 k = kh_put(map, h->preservation_map, "UI", &r); | |
| 125 if (-1 == r) return NULL; | |
| 126 kh_val(h->preservation_map, k).i = 1; | |
| 127 | |
| 128 k = kh_put(map, h->preservation_map, "MI", &r); | |
| 129 if (-1 == r) return NULL; | |
| 130 kh_val(h->preservation_map, k).i = 1; | |
| 131 | |
| 132 } else { | |
| 133 // Technically SM was in 1.0, but wasn't in Java impl. | |
| 134 k = kh_put(map, h->preservation_map, "SM", &r); | |
| 135 if (-1 == r) return NULL; | |
| 136 kh_val(h->preservation_map, k).i = 0; | |
| 137 | |
| 138 k = kh_put(map, h->preservation_map, "TD", &r); | |
| 139 if (-1 == r) return NULL; | |
| 140 kh_val(h->preservation_map, k).i = 0; | |
| 141 | |
| 142 k = kh_put(map, h->preservation_map, "AP", &r); | |
| 143 if (-1 == r) return NULL; | |
| 144 kh_val(h->preservation_map, k).i = c->pos_sorted; | |
| 145 | |
| 146 if (fd->no_ref || fd->embed_ref) { | |
| 147 // Reference Required == No | |
| 148 k = kh_put(map, h->preservation_map, "RR", &r); | |
| 149 if (-1 == r) return NULL; | |
| 150 kh_val(h->preservation_map, k).i = 0; | |
| 151 } | |
| 152 } | |
| 153 } | |
| 154 | |
| 155 /* Encode preservation map; could collapse this and above into one */ | |
| 156 mc = 0; | |
| 157 BLOCK_SIZE(map) = 0; | |
| 158 if (h->preservation_map) { | |
| 159 khint_t k; | |
| 160 | |
| 161 for (k = kh_begin(h->preservation_map); | |
| 162 k != kh_end(h->preservation_map); | |
| 163 k++) { | |
| 164 const char *key; | |
| 165 khash_t(map) *pmap = h->preservation_map; | |
| 166 | |
| 167 | |
| 168 if (!kh_exist(pmap, k)) | |
| 169 continue; | |
| 170 | |
| 171 key = kh_key(pmap, k); | |
| 172 BLOCK_APPEND(map, key, 2); | |
| 173 | |
| 174 switch(CRAM_KEY(key[0], key[1])) { | |
| 175 case CRAM_KEY('M','I'): | |
| 176 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
| 177 break; | |
| 178 | |
| 179 case CRAM_KEY('U','I'): | |
| 180 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
| 181 break; | |
| 182 | |
| 183 case CRAM_KEY('P','I'): | |
| 184 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
| 185 break; | |
| 186 | |
| 187 case CRAM_KEY('A','P'): | |
| 188 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
| 189 break; | |
| 190 | |
| 191 case CRAM_KEY('R','N'): | |
| 192 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
| 193 break; | |
| 194 | |
| 195 case CRAM_KEY('R','R'): | |
| 196 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
| 197 break; | |
| 198 | |
| 199 case CRAM_KEY('S','M'): { | |
| 200 char smat[5], *mp = smat; | |
| 201 *mp++ = | |
| 202 (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) | | |
| 203 (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) | | |
| 204 (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) | | |
| 205 (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0); | |
| 206 *mp++ = | |
| 207 (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) | | |
| 208 (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) | | |
| 209 (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) | | |
| 210 (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0); | |
| 211 *mp++ = | |
| 212 (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) | | |
| 213 (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) | | |
| 214 (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) | | |
| 215 (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0); | |
| 216 *mp++ = | |
| 217 (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) | | |
| 218 (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) | | |
| 219 (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) | | |
| 220 (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0); | |
| 221 *mp++ = | |
| 222 (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) | | |
| 223 (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) | | |
| 224 (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) | | |
| 225 (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0); | |
| 226 BLOCK_APPEND(map, smat, 5); | |
| 227 break; | |
| 228 } | |
| 229 | |
| 230 case CRAM_KEY('T','D'): { | |
| 231 itf8_put_blk(map, BLOCK_SIZE(h->TD_blk)); | |
| 232 BLOCK_APPEND(map, | |
| 233 BLOCK_DATA(h->TD_blk), | |
| 234 BLOCK_SIZE(h->TD_blk)); | |
| 235 break; | |
| 236 } | |
| 237 | |
| 238 default: | |
| 239 fprintf(stderr, "Unknown preservation key '%.2s'\n", key); | |
| 240 break; | |
| 241 } | |
| 242 | |
| 243 mc++; | |
| 244 } | |
| 245 } | |
| 246 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); | |
| 247 itf8_put_blk(cb, mc); | |
| 248 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); | |
| 249 | |
| 250 /* rec encoding map */ | |
| 251 mc = 0; | |
| 252 BLOCK_SIZE(map) = 0; | |
| 253 if (h->codecs[DS_BF]) { | |
| 254 if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF", | |
| 255 fd->version)) | |
| 256 return NULL; | |
| 257 mc++; | |
| 258 } | |
| 259 if (h->codecs[DS_CF]) { | |
| 260 if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF", | |
| 261 fd->version)) | |
| 262 return NULL; | |
| 263 mc++; | |
| 264 } | |
| 265 if (h->codecs[DS_RL]) { | |
| 266 if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL", | |
| 267 fd->version)) | |
| 268 return NULL; | |
| 269 mc++; | |
| 270 } | |
| 271 if (h->codecs[DS_AP]) { | |
| 272 if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP", | |
| 273 fd->version)) | |
| 274 return NULL; | |
| 275 mc++; | |
| 276 } | |
| 277 if (h->codecs[DS_RG]) { | |
| 278 if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG", | |
| 279 fd->version)) | |
| 280 return NULL; | |
| 281 mc++; | |
| 282 } | |
| 283 if (h->codecs[DS_MF]) { | |
| 284 if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF", | |
| 285 fd->version)) | |
| 286 return NULL; | |
| 287 mc++; | |
| 288 } | |
| 289 if (h->codecs[DS_NS]) { | |
| 290 if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS", | |
| 291 fd->version)) | |
| 292 return NULL; | |
| 293 mc++; | |
| 294 } | |
| 295 if (h->codecs[DS_NP]) { | |
| 296 if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP", | |
| 297 fd->version)) | |
| 298 return NULL; | |
| 299 mc++; | |
| 300 } | |
| 301 if (h->codecs[DS_TS]) { | |
| 302 if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS", | |
| 303 fd->version)) | |
| 304 return NULL; | |
| 305 mc++; | |
| 306 } | |
| 307 if (h->codecs[DS_NF]) { | |
| 308 if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF", | |
| 309 fd->version)) | |
| 310 return NULL; | |
| 311 mc++; | |
| 312 } | |
| 313 if (h->codecs[DS_TC]) { | |
| 314 if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC", | |
| 315 fd->version)) | |
| 316 return NULL; | |
| 317 mc++; | |
| 318 } | |
| 319 if (h->codecs[DS_TN]) { | |
| 320 if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN", | |
| 321 fd->version)) | |
| 322 return NULL; | |
| 323 mc++; | |
| 324 } | |
| 325 if (h->codecs[DS_TL]) { | |
| 326 if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL", | |
| 327 fd->version)) | |
| 328 return NULL; | |
| 329 mc++; | |
| 330 } | |
| 331 if (h->codecs[DS_FN]) { | |
| 332 if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN", | |
| 333 fd->version)) | |
| 334 return NULL; | |
| 335 mc++; | |
| 336 } | |
| 337 if (h->codecs[DS_FC]) { | |
| 338 if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC", | |
| 339 fd->version)) | |
| 340 return NULL; | |
| 341 mc++; | |
| 342 } | |
| 343 if (h->codecs[DS_FP]) { | |
| 344 if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP", | |
| 345 fd->version)) | |
| 346 return NULL; | |
| 347 mc++; | |
| 348 } | |
| 349 if (h->codecs[DS_BS]) { | |
| 350 if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS", | |
| 351 fd->version)) | |
| 352 return NULL; | |
| 353 mc++; | |
| 354 } | |
| 355 if (h->codecs[DS_IN]) { | |
| 356 if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN", | |
| 357 fd->version)) | |
| 358 return NULL; | |
| 359 mc++; | |
| 360 } | |
| 361 if (h->codecs[DS_DL]) { | |
| 362 if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL", | |
| 363 fd->version)) | |
| 364 return NULL; | |
| 365 mc++; | |
| 366 } | |
| 367 if (h->codecs[DS_BA]) { | |
| 368 if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA", | |
| 369 fd->version)) | |
| 370 return NULL; | |
| 371 mc++; | |
| 372 } | |
| 373 if (h->codecs[DS_BB]) { | |
| 374 if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB", | |
| 375 fd->version)) | |
| 376 return NULL; | |
| 377 mc++; | |
| 378 } | |
| 379 if (h->codecs[DS_MQ]) { | |
| 380 if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ", | |
| 381 fd->version)) | |
| 382 return NULL; | |
| 383 mc++; | |
| 384 } | |
| 385 if (h->codecs[DS_RN]) { | |
| 386 if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN", | |
| 387 fd->version)) | |
| 388 return NULL; | |
| 389 mc++; | |
| 390 } | |
| 391 if (h->codecs[DS_QS]) { | |
| 392 if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS", | |
| 393 fd->version)) | |
| 394 return NULL; | |
| 395 mc++; | |
| 396 } | |
| 397 if (h->codecs[DS_QQ]) { | |
| 398 if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ", | |
| 399 fd->version)) | |
| 400 return NULL; | |
| 401 mc++; | |
| 402 } | |
| 403 if (h->codecs[DS_RI]) { | |
| 404 if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI", | |
| 405 fd->version)) | |
| 406 return NULL; | |
| 407 mc++; | |
| 408 } | |
| 409 if (CRAM_MAJOR_VERS(fd->version) != 1) { | |
| 410 if (h->codecs[DS_SC]) { | |
| 411 if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC", | |
| 412 fd->version)) | |
| 413 return NULL; | |
| 414 mc++; | |
| 415 } | |
| 416 if (h->codecs[DS_RS]) { | |
| 417 if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS", | |
| 418 fd->version)) | |
| 419 return NULL; | |
| 420 mc++; | |
| 421 } | |
| 422 if (h->codecs[DS_PD]) { | |
| 423 if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD", | |
| 424 fd->version)) | |
| 425 return NULL; | |
| 426 mc++; | |
| 427 } | |
| 428 if (h->codecs[DS_HC]) { | |
| 429 if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC", | |
| 430 fd->version)) | |
| 431 return NULL; | |
| 432 mc++; | |
| 433 } | |
| 434 } | |
| 435 if (h->codecs[DS_TM]) { | |
| 436 if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM", | |
| 437 fd->version)) | |
| 438 return NULL; | |
| 439 mc++; | |
| 440 } | |
| 441 if (h->codecs[DS_TV]) { | |
| 442 if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV", | |
| 443 fd->version)) | |
| 444 return NULL; | |
| 445 mc++; | |
| 446 } | |
| 447 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); | |
| 448 itf8_put_blk(cb, mc); | |
| 449 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); | |
| 450 | |
| 451 /* tag encoding map */ | |
| 452 #if 0 | |
| 453 mp = map; mc = 0; | |
| 454 if (h->tag_encoding_map) { | |
| 455 HashItem *hi; | |
| 456 HashIter *iter = HashTableIterCreate(); | |
| 457 if (!iter) | |
| 458 return NULL; | |
| 459 | |
| 460 while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) { | |
| 461 cram_map *m = hi->data.p; | |
| 462 int sz; | |
| 463 | |
| 464 mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]); | |
| 465 if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version))) | |
| 466 return NULL; | |
| 467 mp += sz; | |
| 468 mc++; | |
| 469 } | |
| 470 | |
| 471 HashTableIterDestroy(iter); | |
| 472 } | |
| 473 #else | |
| 474 mc = 0; | |
| 475 BLOCK_SIZE(map) = 0; | |
| 476 if (c->tags_used) { | |
| 477 khint_t k; | |
| 478 | |
| 479 #define TAG_ID(a) ((#a[0]<<8)+#a[1]) | |
| 480 | |
| 481 for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) { | |
| 482 int key; | |
| 483 if (!kh_exist(c->tags_used, k)) | |
| 484 continue; | |
| 485 | |
| 486 mc++; | |
| 487 itf8_put_blk(map, kh_key(c->tags_used, k)); | |
| 488 | |
| 489 // use block content id 4 | |
| 490 switch((key = kh_key(c->tags_used, k)) & 0xff) { | |
| 491 case 'Z': case 'H': | |
| 492 // string as byte_array_stop | |
| 493 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
| 494 BLOCK_APPEND(map, | |
| 495 "\005" // BYTE_ARRAY_STOP | |
| 496 "\005" // len | |
| 497 "\t" // stop-byte is also SAM separator | |
| 498 DS_aux_S "\000\000\000", | |
| 499 7); | |
| 500 } else { | |
| 501 if (key>>8 == TAG_ID(OQ)) | |
| 502 BLOCK_APPEND(map, | |
| 503 "\005" // BYTE_ARRAY_STOP | |
| 504 "\002" // len | |
| 505 "\t" // stop-byte is also SAM separator | |
| 506 DS_aux_OQ_S, | |
| 507 4); | |
| 508 else if (key>>8 == TAG_ID(BQ)) | |
| 509 BLOCK_APPEND(map, | |
| 510 "\005" // BYTE_ARRAY_STOP | |
| 511 "\002" // len | |
| 512 "\t" // stop-byte is also SAM separator | |
| 513 DS_aux_BQ_S, | |
| 514 4); | |
| 515 else if (key>>8 == TAG_ID(BD)) | |
| 516 BLOCK_APPEND(map, | |
| 517 "\005" // BYTE_ARRAY_STOP | |
| 518 "\002" // len | |
| 519 "\t" // stop-byte is also SAM separator | |
| 520 DS_aux_BD_S, | |
| 521 4); | |
| 522 else if (key>>8 == TAG_ID(BI)) | |
| 523 BLOCK_APPEND(map, | |
| 524 "\005" // BYTE_ARRAY_STOP | |
| 525 "\002" // len | |
| 526 "\t" // stop-byte is also SAM separator | |
| 527 DS_aux_BI_S, | |
| 528 4); | |
| 529 else if ((key>>8 == TAG_ID(Q2)) || | |
| 530 (key>>8 == TAG_ID(U2)) || | |
| 531 (key>>8 == TAG_ID(QT)) || | |
| 532 (key>>8 == TAG_ID(CQ))) | |
| 533 BLOCK_APPEND(map, | |
| 534 "\005" // BYTE_ARRAY_STOP | |
| 535 "\002" // len | |
| 536 "\t" // stop-byte is also SAM separator | |
| 537 DS_aux_oq_S, | |
| 538 4); | |
| 539 else if ((key>>8 == TAG_ID(R2)) || | |
| 540 (key>>8 == TAG_ID(E2)) || | |
| 541 (key>>8 == TAG_ID(CS)) || | |
| 542 (key>>8 == TAG_ID(BC)) || | |
| 543 (key>>8 == TAG_ID(RT))) | |
| 544 BLOCK_APPEND(map, | |
| 545 "\005" // BYTE_ARRAY_STOP | |
| 546 "\002" // len | |
| 547 "\t" // stop-byte is also SAM separator | |
| 548 DS_aux_os_S, | |
| 549 4); | |
| 550 else | |
| 551 BLOCK_APPEND(map, | |
| 552 "\005" // BYTE_ARRAY_STOP | |
| 553 "\002" // len | |
| 554 "\t" // stop-byte is also SAM separator | |
| 555 DS_aux_oz_S, | |
| 556 4); | |
| 557 } | |
| 558 break; | |
| 559 | |
| 560 case 'A': case 'c': case 'C': | |
| 561 // byte array len, 1 byte | |
| 562 BLOCK_APPEND(map, | |
| 563 "\004" // BYTE_ARRAY_LEN | |
| 564 "\011" // length | |
| 565 "\003" // HUFFMAN (len) | |
| 566 "\004" // huffman-len | |
| 567 "\001" // 1 symbol | |
| 568 "\001" // symbol=1 byte value | |
| 569 "\001" // 1 length | |
| 570 "\000" // length=0 | |
| 571 "\001" // EXTERNAL (val) | |
| 572 "\001" // external-len | |
| 573 DS_aux_S,// content-id | |
| 574 11); | |
| 575 break; | |
| 576 | |
| 577 case 's': case 'S': | |
| 578 // byte array len, 2 byte | |
| 579 BLOCK_APPEND(map, | |
| 580 "\004" // BYTE_ARRAY_LEN | |
| 581 "\011" // length | |
| 582 "\003" // HUFFMAN (len) | |
| 583 "\004" // huffman-len | |
| 584 "\001" // 1 symbol | |
| 585 "\002" // symbol=2 byte value | |
| 586 "\001" // 1 length | |
| 587 "\000" // length=0 | |
| 588 "\001" // EXTERNAL (val) | |
| 589 "\001" // external-len | |
| 590 DS_aux_S,// content-id | |
| 591 11); | |
| 592 break; | |
| 593 | |
| 594 case 'i': case 'I': case 'f': | |
| 595 // byte array len, 4 byte | |
| 596 BLOCK_APPEND(map, | |
| 597 "\004" // BYTE_ARRAY_LEN | |
| 598 "\011" // length | |
| 599 "\003" // HUFFMAN (len) | |
| 600 "\004" // huffman-len | |
| 601 "\001" // 1 symbol | |
| 602 "\004" // symbol=4 byte value | |
| 603 "\001" // 1 length | |
| 604 "\000" // length=0 | |
| 605 "\001" // EXTERNAL (val) | |
| 606 "\001" // external-len | |
| 607 DS_aux_S,// content-id | |
| 608 11); | |
| 609 break; | |
| 610 | |
| 611 case 'B': | |
| 612 // Byte array of variable size, but we generate our tag | |
| 613 // byte stream at the wrong stage (during reading and not | |
| 614 // after slice header construction). So we use | |
| 615 // BYTE_ARRAY_LEN with the length codec being external | |
| 616 // too. | |
| 617 if ((key>>8 == TAG_ID(FZ)) || (key>>8 == TAG_ID(ZM))) | |
| 618 BLOCK_APPEND(map, | |
| 619 "\004" // BYTE_ARRAY_LEN | |
| 620 "\006" // length | |
| 621 "\001" // EXTERNAL (len) | |
| 622 "\001" // external-len | |
| 623 DS_aux_FZ_S // content-id | |
| 624 "\001" // EXTERNAL (val) | |
| 625 "\001" // external-len | |
| 626 DS_aux_FZ_S,// content-id | |
| 627 8); | |
| 628 else | |
| 629 BLOCK_APPEND(map, | |
| 630 "\004" // BYTE_ARRAY_LEN | |
| 631 "\006" // length | |
| 632 "\001" // EXTERNAL (len) | |
| 633 "\001" // external-len | |
| 634 DS_aux_S // content-id | |
| 635 "\001" // EXTERNAL (val) | |
| 636 "\001" // external-len | |
| 637 DS_aux_S,// content-id | |
| 638 8); | |
| 639 break; | |
| 640 | |
| 641 default: | |
| 642 fprintf(stderr, "Unsupported SAM aux type '%c'\n", | |
| 643 kh_key(c->tags_used, k) & 0xff); | |
| 644 } | |
| 645 //mp += m->codec->store(m->codec, mp, NULL, fd->version); | |
| 646 } | |
| 647 } | |
| 648 #endif | |
| 649 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); | |
| 650 itf8_put_blk(cb, mc); | |
| 651 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); | |
| 652 | |
| 653 if (fd->verbose) | |
| 654 fprintf(stderr, "Wrote compression block header in %d bytes\n", | |
| 655 (int)BLOCK_SIZE(cb)); | |
| 656 | |
| 657 BLOCK_UPLEN(cb); | |
| 658 | |
| 659 cram_free_block(map); | |
| 660 | |
| 661 return cb; | |
| 662 } | |
| 663 | |
| 664 | |
| 665 /* | |
| 666 * Encodes a slice compression header. | |
| 667 * | |
| 668 * Returns cram_block on success | |
| 669 * NULL on failure | |
| 670 */ | |
| 671 cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { | |
| 672 char *buf; | |
| 673 char *cp; | |
| 674 cram_block *b = cram_new_block(MAPPED_SLICE, 0); | |
| 675 int j; | |
| 676 | |
| 677 if (!b) | |
| 678 return NULL; | |
| 679 | |
| 680 if (NULL == (cp = buf = malloc(16+5*(8+s->hdr->num_blocks)))) { | |
| 681 cram_free_block(b); | |
| 682 return NULL; | |
| 683 } | |
| 684 | |
| 685 cp += itf8_put(cp, s->hdr->ref_seq_id); | |
| 686 cp += itf8_put(cp, s->hdr->ref_seq_start); | |
| 687 cp += itf8_put(cp, s->hdr->ref_seq_span); | |
| 688 cp += itf8_put(cp, s->hdr->num_records); | |
| 689 if (CRAM_MAJOR_VERS(fd->version) == 2) | |
| 690 cp += itf8_put(cp, s->hdr->record_counter); | |
| 691 else if (CRAM_MAJOR_VERS(fd->version) >= 3) | |
| 692 cp += ltf8_put(cp, s->hdr->record_counter); | |
| 693 cp += itf8_put(cp, s->hdr->num_blocks); | |
| 694 cp += itf8_put(cp, s->hdr->num_content_ids); | |
| 695 for (j = 0; j < s->hdr->num_content_ids; j++) { | |
| 696 cp += itf8_put(cp, s->hdr->block_content_ids[j]); | |
| 697 } | |
| 698 if (s->hdr->content_type == MAPPED_SLICE) | |
| 699 cp += itf8_put(cp, s->hdr->ref_base_id); | |
| 700 | |
| 701 if (CRAM_MAJOR_VERS(fd->version) != 1) { | |
| 702 memcpy(cp, s->hdr->md5, 16); cp += 16; | |
| 703 } | |
| 704 | |
| 705 assert(cp-buf <= 16+5*(8+s->hdr->num_blocks)); | |
| 706 | |
| 707 b->data = (unsigned char *)buf; | |
| 708 b->comp_size = b->uncomp_size = cp-buf; | |
| 709 | |
| 710 return b; | |
| 711 } | |
| 712 | |
| 713 | |
| 714 /* | |
| 715 * Encodes a single read. | |
| 716 * | |
| 717 * Returns 0 on success | |
| 718 * -1 on failure | |
| 719 */ | |
| 720 static int cram_encode_slice_read(cram_fd *fd, | |
| 721 cram_container *c, | |
| 722 cram_block_compression_hdr *h, | |
| 723 cram_slice *s, | |
| 724 cram_record *cr, | |
| 725 int *last_pos) { | |
| 726 int r = 0; | |
| 727 int32_t i32; | |
| 728 unsigned char uc; | |
| 729 | |
| 730 //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name); | |
| 731 | |
| 732 //printf("BF=0x%x\n", cr->flags); | |
| 733 // bf = cram_flag_swap[cr->flags]; | |
| 734 i32 = fd->cram_flag_swap[cr->flags & 0xfff]; | |
| 735 r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1); | |
| 736 | |
| 737 i32 = cr->cram_flags; | |
| 738 r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1); | |
| 739 | |
| 740 if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2) | |
| 741 r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1); | |
| 742 | |
| 743 r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1); | |
| 744 | |
| 745 if (c->pos_sorted) { | |
| 746 i32 = cr->apos - *last_pos; | |
| 747 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); | |
| 748 *last_pos = cr->apos; | |
| 749 } else { | |
| 750 i32 = cr->apos; | |
| 751 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); | |
| 752 } | |
| 753 | |
| 754 r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1); | |
| 755 | |
| 756 if (c->comp_hdr->read_names_included) { | |
| 757 // RN codec: Already stored in block[3]. | |
| 758 } | |
| 759 | |
| 760 if (cr->cram_flags & CRAM_FLAG_DETACHED) { | |
| 761 i32 = cr->mate_flags; | |
| 762 r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1); | |
| 763 | |
| 764 if (!c->comp_hdr->read_names_included) { | |
| 765 // RN codec: Already stored in block[3]. | |
| 766 } | |
| 767 | |
| 768 r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS], | |
| 769 (char *)&cr->mate_ref_id, 1); | |
| 770 | |
| 771 r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], | |
| 772 (char *)&cr->mate_pos, 1); | |
| 773 | |
| 774 r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], | |
| 775 (char *)&cr->tlen, 1); | |
| 776 } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { | |
| 777 r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], | |
| 778 (char *)&cr->mate_line, 1); | |
| 779 } | |
| 780 | |
| 781 /* Aux tags */ | |
| 782 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
| 783 int j; | |
| 784 uc = cr->ntags; | |
| 785 r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1); | |
| 786 | |
| 787 for (j = 0; j < cr->ntags; j++) { | |
| 788 uint32_t i32 = s->TN[cr->TN_idx + j]; // id | |
| 789 r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1); | |
| 790 } | |
| 791 } else { | |
| 792 r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1); | |
| 793 } | |
| 794 | |
| 795 // qual | |
| 796 // QS codec : Already stored in block[2]. | |
| 797 | |
| 798 // features (diffs) | |
| 799 if (!(cr->flags & BAM_FUNMAP)) { | |
| 800 int prev_pos = 0, j; | |
| 801 | |
| 802 r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN], | |
| 803 (char *)&cr->nfeature, 1); | |
| 804 for (j = 0; j < cr->nfeature; j++) { | |
| 805 cram_feature *f = &s->features[cr->feature + j]; | |
| 806 | |
| 807 uc = f->X.code; | |
| 808 r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1); | |
| 809 i32 = f->X.pos - prev_pos; | |
| 810 r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1); | |
| 811 prev_pos = f->X.pos; | |
| 812 | |
| 813 switch(f->X.code) { | |
| 814 //char *seq; | |
| 815 | |
| 816 case 'X': | |
| 817 //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base); | |
| 818 | |
| 819 uc = f->X.base; | |
| 820 r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS], | |
| 821 (char *)&uc, 1); | |
| 822 break; | |
| 823 case 'S': | |
| 824 // Already done | |
| 825 // r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC], | |
| 826 // BLOCK_DATA(s->soft_blk) + f->S.seq_idx, | |
| 827 // f->S.len); | |
| 828 | |
| 829 // if (IS_CRAM_3_VERS(fd)) { | |
| 830 // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], | |
| 831 // BLOCK_DATA(s->seqs_blk) + f->S.seq_idx, | |
| 832 // f->S.len); | |
| 833 // } | |
| 834 break; | |
| 835 case 'I': | |
| 836 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; | |
| 837 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN], | |
| 838 // seq, f->S.len); | |
| 839 // if (IS_CRAM_3_VERS(fd)) { | |
| 840 // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], | |
| 841 // BLOCK_DATA(s->seqs_blk) + f->I.seq_idx, | |
| 842 // f->I.len); | |
| 843 // } | |
| 844 break; | |
| 845 case 'i': | |
| 846 uc = f->i.base; | |
| 847 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], | |
| 848 (char *)&uc, 1); | |
| 849 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; | |
| 850 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN], | |
| 851 // seq, 1); | |
| 852 break; | |
| 853 case 'D': | |
| 854 i32 = f->D.len; | |
| 855 r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL], | |
| 856 (char *)&i32, 1); | |
| 857 break; | |
| 858 | |
| 859 case 'B': | |
| 860 // // Used when we try to store a non ACGTN base or an N | |
| 861 // // that aligns against a non ACGTN reference | |
| 862 | |
| 863 uc = f->B.base; | |
| 864 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], | |
| 865 (char *)&uc, 1); | |
| 866 | |
| 867 // Already added | |
| 868 // uc = f->B.qual; | |
| 869 // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS], | |
| 870 // (char *)&uc, 1); | |
| 871 break; | |
| 872 | |
| 873 case 'b': | |
| 874 // string of bases | |
| 875 r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], | |
| 876 (char *)BLOCK_DATA(s->seqs_blk) | |
| 877 + f->b.seq_idx, | |
| 878 f->b.len); | |
| 879 break; | |
| 880 | |
| 881 case 'Q': | |
| 882 // Already added | |
| 883 // uc = f->B.qual; | |
| 884 // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS], | |
| 885 // (char *)&uc, 1); | |
| 886 break; | |
| 887 | |
| 888 case 'N': | |
| 889 i32 = f->N.len; | |
| 890 r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS], | |
| 891 (char *)&i32, 1); | |
| 892 break; | |
| 893 | |
| 894 case 'P': | |
| 895 i32 = f->P.len; | |
| 896 r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD], | |
| 897 (char *)&i32, 1); | |
| 898 break; | |
| 899 | |
| 900 case 'H': | |
| 901 i32 = f->H.len; | |
| 902 r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC], | |
| 903 (char *)&i32, 1); | |
| 904 break; | |
| 905 | |
| 906 | |
| 907 default: | |
| 908 fprintf(stderr, "unhandled feature code %c\n", | |
| 909 f->X.code); | |
| 910 return -1; | |
| 911 } | |
| 912 } | |
| 913 | |
| 914 r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ], | |
| 915 (char *)&cr->mqual, 1); | |
| 916 } else { | |
| 917 char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq; | |
| 918 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len); | |
| 919 } | |
| 920 | |
| 921 return r ? -1 : 0; | |
| 922 } | |
| 923 | |
| 924 | |
| 925 /* | |
| 926 * Applies various compression methods to specific blocks, depending on | |
| 927 * known observations of how data series compress. | |
| 928 * | |
| 929 * Returns 0 on success | |
| 930 * -1 on failure | |
| 931 */ | |
| 932 static int cram_compress_slice(cram_fd *fd, cram_slice *s) { | |
| 933 int level = fd->level, i; | |
| 934 int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method; | |
| 935 | |
| 936 /* Compress the CORE Block too, with minimal zlib level */ | |
| 937 if (level > 5 && s->block[0]->uncomp_size > 500) | |
| 938 cram_compress_block(fd, s->block[0], NULL, GZIP, 1); | |
| 939 | |
| 940 if (fd->use_bz2) | |
| 941 method |= 1<<BZIP2; | |
| 942 | |
| 943 if (fd->use_rans) | |
| 944 method |= (1<<RANS0) | (1<<RANS1); | |
| 945 | |
| 946 if (fd->use_lzma) | |
| 947 method |= (1<<LZMA); | |
| 948 | |
| 949 /* Faster method for data series we only need entropy encoding on */ | |
| 950 methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA); | |
| 951 if (level >= 6) | |
| 952 methodF = method; | |
| 953 | |
| 954 | |
| 955 /* Specific compression methods for certain block types */ | |
| 956 if (cram_compress_block(fd, s->block[DS_IN], fd->m[DS_IN], //IN (seq) | |
| 957 method, level)) | |
| 958 return -1; | |
| 959 | |
| 960 if (fd->level == 0) { | |
| 961 /* Do nothing */ | |
| 962 } else if (fd->level == 1) { | |
| 963 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], | |
| 964 methodF, 1)) | |
| 965 return -1; | |
| 966 for (i = DS_aux; i <= DS_aux_oz; i++) { | |
| 967 if (s->block[i]) | |
| 968 if (cram_compress_block(fd, s->block[i], fd->m[i], | |
| 969 method, 1)) | |
| 970 return -1; | |
| 971 } | |
| 972 } else if (fd->level < 3) { | |
| 973 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], | |
| 974 method, 1)) | |
| 975 return -1; | |
| 976 if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], | |
| 977 method, 1)) | |
| 978 return -1; | |
| 979 if (s->block[DS_BB]) | |
| 980 if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], | |
| 981 method, 1)) | |
| 982 return -1; | |
| 983 for (i = DS_aux; i <= DS_aux_oz; i++) { | |
| 984 if (s->block[i]) | |
| 985 if (cram_compress_block(fd, s->block[i], fd->m[i], | |
| 986 method, level)) | |
| 987 return -1; | |
| 988 } | |
| 989 } else { | |
| 990 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], | |
| 991 method, level)) | |
| 992 return -1; | |
| 993 if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], | |
| 994 method, level)) | |
| 995 return -1; | |
| 996 if (s->block[DS_BB]) | |
| 997 if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], | |
| 998 method, level)) | |
| 999 return -1; | |
| 1000 for (i = DS_aux; i <= DS_aux_oz; i++) { | |
| 1001 if (s->block[i]) | |
| 1002 if (cram_compress_block(fd, s->block[i], fd->m[i], | |
| 1003 method, level)) | |
| 1004 return -1; | |
| 1005 } | |
| 1006 } | |
| 1007 | |
| 1008 // NAME: best is generally xz, bzip2, zlib then rans1 | |
| 1009 // It benefits well from a little bit extra compression level. | |
| 1010 if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN], | |
| 1011 method & ~(1<<RANS0 | 1<<GZIP_RLE), | |
| 1012 MIN(9,level))) | |
| 1013 return -1; | |
| 1014 | |
| 1015 // NS shows strong local correlation as rearrangements are localised | |
| 1016 if (s->block[DS_NS] != s->block[0]) | |
| 1017 if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS], | |
| 1018 method, level)) | |
| 1019 return -1; | |
| 1020 | |
| 1021 | |
| 1022 /* | |
| 1023 * Minimal compression of any block still uncompressed, bar CORE | |
| 1024 */ | |
| 1025 { | |
| 1026 int i; | |
| 1027 for (i = 1; i < DS_END; i++) { | |
| 1028 if (!s->block[i] || s->block[i] == s->block[0]) | |
| 1029 continue; | |
| 1030 | |
| 1031 // fast methods only | |
| 1032 if (s->block[i]->method == RAW) { | |
| 1033 cram_compress_block(fd, s->block[i], fd->m[i], | |
| 1034 methodF, level); | |
| 1035 } | |
| 1036 } | |
| 1037 } | |
| 1038 | |
| 1039 return 0; | |
| 1040 } | |
| 1041 | |
| 1042 /* | |
| 1043 * Encodes a single slice from a container | |
| 1044 * | |
| 1045 * Returns 0 on success | |
| 1046 * -1 on failure | |
| 1047 */ | |
| 1048 static int cram_encode_slice(cram_fd *fd, cram_container *c, | |
| 1049 cram_block_compression_hdr *h, cram_slice *s) { | |
| 1050 int rec, r = 0, last_pos; | |
| 1051 int embed_ref; | |
| 1052 enum cram_DS_ID id; | |
| 1053 | |
| 1054 embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0; | |
| 1055 | |
| 1056 /* | |
| 1057 * Slice external blocks: | |
| 1058 * ID 0 => base calls (insertions, soft-clip) | |
| 1059 * ID 1 => qualities | |
| 1060 * ID 2 => names | |
| 1061 * ID 3 => TS (insert size), NP (next frag) | |
| 1062 * ID 4 => tag values | |
| 1063 * ID 6 => tag IDs (TN), if CRAM_V1.0 | |
| 1064 * ID 7 => TD tag dictionary, if !CRAM_V1.0 | |
| 1065 */ | |
| 1066 | |
| 1067 /* Create cram slice header */ | |
| 1068 s->hdr->ref_base_id = embed_ref ? DS_ref : -1; | |
| 1069 s->hdr->record_counter = c->num_records + c->record_counter; | |
| 1070 c->num_records += s->hdr->num_records; | |
| 1071 | |
| 1072 s->block = calloc(DS_END, sizeof(s->block[0])); | |
| 1073 s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t)); | |
| 1074 if (!s->block || !s->hdr->block_content_ids) | |
| 1075 return -1; | |
| 1076 | |
| 1077 // Create first fixed blocks, always external. | |
| 1078 // CORE | |
| 1079 if (!(s->block[0] = cram_new_block(CORE, 0))) | |
| 1080 return -1; | |
| 1081 | |
| 1082 // TN block for CRAM v1 | |
| 1083 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
| 1084 if (h->codecs[DS_TN]->codec == E_EXTERNAL) { | |
| 1085 if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1; | |
| 1086 h->codecs[DS_TN]->external.content_id = DS_TN; | |
| 1087 } else { | |
| 1088 s->block[DS_TN] = s->block[0]; | |
| 1089 } | |
| 1090 s->block[DS_TN] = s->block[DS_TN]; | |
| 1091 } | |
| 1092 | |
| 1093 // Embedded reference | |
| 1094 if (embed_ref) { | |
| 1095 if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref))) | |
| 1096 return -1; | |
| 1097 s->ref_id = DS_ref; // needed? | |
| 1098 BLOCK_APPEND(s->block[DS_ref], | |
| 1099 c->ref + c->first_base - c->ref_start, | |
| 1100 c->last_base - c->first_base + 1); | |
| 1101 } | |
| 1102 | |
| 1103 /* | |
| 1104 * All the data-series blocks if appropriate. | |
| 1105 */ | |
| 1106 for (id = DS_BF; id < DS_TN; id++) { | |
| 1107 if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL || | |
| 1108 h->codecs[id]->codec == E_BYTE_ARRAY_STOP || | |
| 1109 h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) { | |
| 1110 switch (h->codecs[id]->codec) { | |
| 1111 case E_EXTERNAL: | |
| 1112 if (!(s->block[id] = cram_new_block(EXTERNAL, id))) | |
| 1113 return -1; | |
| 1114 h->codecs[id]->external.content_id = id; | |
| 1115 break; | |
| 1116 | |
| 1117 case E_BYTE_ARRAY_STOP: | |
| 1118 if (!(s->block[id] = cram_new_block(EXTERNAL, id))) | |
| 1119 return -1; | |
| 1120 h->codecs[id]->byte_array_stop.content_id = id; | |
| 1121 break; | |
| 1122 | |
| 1123 case E_BYTE_ARRAY_LEN: { | |
| 1124 cram_codec *cc; | |
| 1125 | |
| 1126 cc = h->codecs[id]->e_byte_array_len.len_codec; | |
| 1127 if (cc->codec == E_EXTERNAL) { | |
| 1128 int eid = cc->external.content_id; | |
| 1129 if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) | |
| 1130 return -1; | |
| 1131 cc->external.content_id = eid; | |
| 1132 cc->out = s->block[eid]; | |
| 1133 } | |
| 1134 | |
| 1135 cc = h->codecs[id]->e_byte_array_len.val_codec; | |
| 1136 if (cc->codec == E_EXTERNAL) { | |
| 1137 int eid = cc->external.content_id; | |
| 1138 if (!s->block[eid]) | |
| 1139 if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) | |
| 1140 return -1; | |
| 1141 cc->external.content_id = eid; | |
| 1142 cc->out = s->block[eid]; | |
| 1143 } | |
| 1144 break; | |
| 1145 } | |
| 1146 default: | |
| 1147 break; | |
| 1148 } | |
| 1149 } else { | |
| 1150 if (!(id == DS_BB && !h->codecs[DS_BB])) | |
| 1151 s->block[id] = s->block[0]; | |
| 1152 } | |
| 1153 if (h->codecs[id]) | |
| 1154 h->codecs[id]->out = s->block[id]; | |
| 1155 } | |
| 1156 | |
| 1157 /* Encode reads */ | |
| 1158 last_pos = s->hdr->ref_seq_start; | |
| 1159 for (rec = 0; rec < s->hdr->num_records; rec++) { | |
| 1160 cram_record *cr = &s->crecs[rec]; | |
| 1161 if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1) | |
| 1162 return -1; | |
| 1163 } | |
| 1164 | |
| 1165 s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7); | |
| 1166 s->block[0]->comp_size = s->block[0]->uncomp_size; | |
| 1167 | |
| 1168 // Make sure the fixed blocks point to the correct sources | |
| 1169 s->block[DS_IN] = s->base_blk; s->base_blk = NULL; | |
| 1170 s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL; | |
| 1171 s->block[DS_RN] = s->name_blk; s->name_blk = NULL; | |
| 1172 s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL; | |
| 1173 s->block[DS_aux]= s->aux_blk; s->aux_blk = NULL; | |
| 1174 s->block[DS_aux_OQ]= s->aux_OQ_blk; s->aux_OQ_blk = NULL; | |
| 1175 s->block[DS_aux_BQ]= s->aux_BQ_blk; s->aux_BQ_blk = NULL; | |
| 1176 s->block[DS_aux_BD]= s->aux_BD_blk; s->aux_BD_blk = NULL; | |
| 1177 s->block[DS_aux_BI]= s->aux_BI_blk; s->aux_BI_blk = NULL; | |
| 1178 s->block[DS_aux_FZ]= s->aux_FZ_blk; s->aux_FZ_blk = NULL; | |
| 1179 s->block[DS_aux_oq]= s->aux_oq_blk; s->aux_oq_blk = NULL; | |
| 1180 s->block[DS_aux_os]= s->aux_os_blk; s->aux_os_blk = NULL; | |
| 1181 s->block[DS_aux_oz]= s->aux_oz_blk; s->aux_oz_blk = NULL; | |
| 1182 | |
| 1183 // Ensure block sizes are up to date. | |
| 1184 for (id = 1; id < DS_END; id++) { | |
| 1185 if (!s->block[id] || s->block[id] == s->block[0]) | |
| 1186 continue; | |
| 1187 | |
| 1188 if (s->block[id]->uncomp_size == 0) | |
| 1189 BLOCK_UPLEN(s->block[id]); | |
| 1190 } | |
| 1191 | |
| 1192 // Compress it all | |
| 1193 if (cram_compress_slice(fd, s) == -1) | |
| 1194 return -1; | |
| 1195 | |
| 1196 // Collapse empty blocks and create hdr_block | |
| 1197 { | |
| 1198 int i, j; | |
| 1199 for (i = j = 1; i < DS_END; i++) { | |
| 1200 if (!s->block[i] || s->block[i] == s->block[0]) | |
| 1201 continue; | |
| 1202 if (s->block[i]->uncomp_size == 0) { | |
| 1203 cram_free_block(s->block[i]); | |
| 1204 s->block[i] = NULL; | |
| 1205 continue; | |
| 1206 } | |
| 1207 s->block[j] = s->block[i]; | |
| 1208 s->hdr->block_content_ids[j-1] = s->block[i]->content_id; | |
| 1209 j++; | |
| 1210 } | |
| 1211 s->hdr->num_content_ids = j-1; | |
| 1212 s->hdr->num_blocks = j; | |
| 1213 | |
| 1214 if (!(s->hdr_block = cram_encode_slice_header(fd, s))) | |
| 1215 return -1; | |
| 1216 } | |
| 1217 | |
| 1218 return r ? -1 : 0; | |
| 1219 } | |
| 1220 | |
| 1221 /* | |
| 1222 * Encodes all slices in a container into blocks. | |
| 1223 * Returns 0 on success | |
| 1224 * -1 on failure | |
| 1225 */ | |
| 1226 int cram_encode_container(cram_fd *fd, cram_container *c) { | |
| 1227 int i, j, slice_offset; | |
| 1228 cram_block_compression_hdr *h = c->comp_hdr; | |
| 1229 cram_block *c_hdr; | |
| 1230 int multi_ref = 0; | |
| 1231 int r1, r2, sn, nref; | |
| 1232 spare_bams *spares; | |
| 1233 | |
| 1234 /* Cache references up-front if we have unsorted access patterns */ | |
| 1235 pthread_mutex_lock(&fd->ref_lock); | |
| 1236 nref = fd->refs->nref; | |
| 1237 pthread_mutex_unlock(&fd->ref_lock); | |
| 1238 | |
| 1239 if (!fd->no_ref && c->refs_used) { | |
| 1240 for (i = 0; i < nref; i++) { | |
| 1241 if (c->refs_used[i]) | |
| 1242 cram_get_ref(fd, i, 1, 0); | |
| 1243 } | |
| 1244 } | |
| 1245 | |
| 1246 /* To create M5 strings */ | |
| 1247 /* Fetch reference sequence */ | |
| 1248 if (!fd->no_ref) { | |
| 1249 bam_seq_t *b = c->bams[0]; | |
| 1250 char *ref; | |
| 1251 | |
| 1252 ref = cram_get_ref(fd, bam_ref(b), 1, 0); | |
| 1253 if (!ref && bam_ref(b) >= 0) { | |
| 1254 fprintf(stderr, "Failed to load reference #%d\n", bam_ref(b)); | |
| 1255 return -1; | |
| 1256 } | |
| 1257 if ((c->ref_id = bam_ref(b)) >= 0) { | |
| 1258 c->ref_seq_id = c->ref_id; | |
| 1259 c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; | |
| 1260 c->ref_start = 1; | |
| 1261 c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; | |
| 1262 } else { | |
| 1263 c->ref_seq_id = c->ref_id; // FIXME remove one var! | |
| 1264 } | |
| 1265 } else { | |
| 1266 c->ref_id = bam_ref(c->bams[0]); | |
| 1267 cram_ref_incr(fd->refs, c->ref_id); | |
| 1268 c->ref_seq_id = c->ref_id; | |
| 1269 } | |
| 1270 | |
| 1271 /* Turn bams into cram_records and gather basic stats */ | |
| 1272 for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) { | |
| 1273 cram_slice *s = c->slices[sn]; | |
| 1274 int first_base = INT_MAX, last_base = INT_MIN; | |
| 1275 | |
| 1276 assert(sn < c->curr_slice); | |
| 1277 | |
| 1278 /* FIXME: we could create our slice objects here too instead of | |
| 1279 * in cram_put_bam_seq. It's more natural here and also this is | |
| 1280 * bit is threaded so it's less work in the main thread. | |
| 1281 */ | |
| 1282 | |
| 1283 for (r2 = 0; r1 < c->curr_c_rec && r2 < c->max_rec; r1++, r2++) { | |
| 1284 cram_record *cr = &s->crecs[r2]; | |
| 1285 bam_seq_t *b = c->bams[r1]; | |
| 1286 | |
| 1287 /* If multi-ref we need to cope with changing reference per seq */ | |
| 1288 if (c->multi_seq && !fd->no_ref) { | |
| 1289 if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) { | |
| 1290 if (c->ref_seq_id >= 0) | |
| 1291 cram_ref_decr(fd->refs, c->ref_seq_id); | |
| 1292 | |
| 1293 if (!cram_get_ref(fd, bam_ref(b), 1, 0)) { | |
| 1294 fprintf(stderr, "Failed to load reference #%d\n", | |
| 1295 bam_ref(b)); | |
| 1296 return -1; | |
| 1297 } | |
| 1298 | |
| 1299 c->ref_seq_id = bam_ref(b); // overwritten later by -2 | |
| 1300 assert(fd->refs->ref_id[c->ref_seq_id]->seq); | |
| 1301 c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; | |
| 1302 c->ref_start = 1; | |
| 1303 c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; | |
| 1304 } | |
| 1305 } | |
| 1306 | |
| 1307 process_one_read(fd, c, s, cr, b, r2); | |
| 1308 | |
| 1309 if (first_base > cr->apos) | |
| 1310 first_base = cr->apos; | |
| 1311 | |
| 1312 if (last_base < cr->aend) | |
| 1313 last_base = cr->aend; | |
| 1314 } | |
| 1315 | |
| 1316 if (c->multi_seq) { | |
| 1317 s->hdr->ref_seq_id = -2; | |
| 1318 s->hdr->ref_seq_start = 0; | |
| 1319 s->hdr->ref_seq_span = 0; | |
| 1320 } else { | |
| 1321 s->hdr->ref_seq_id = c->ref_id; | |
| 1322 s->hdr->ref_seq_start = first_base; | |
| 1323 s->hdr->ref_seq_span = last_base - first_base + 1; | |
| 1324 } | |
| 1325 s->hdr->num_records = r2; | |
| 1326 } | |
| 1327 | |
| 1328 if (c->multi_seq && !fd->no_ref) { | |
| 1329 if (c->ref_seq_id >= 0) | |
| 1330 cram_ref_decr(fd->refs, c->ref_seq_id); | |
| 1331 } | |
| 1332 | |
| 1333 /* Link our bams[] array onto the spare bam list for reuse */ | |
| 1334 spares = malloc(sizeof(*spares)); | |
| 1335 pthread_mutex_lock(&fd->bam_list_lock); | |
| 1336 spares->bams = c->bams; | |
| 1337 spares->next = fd->bl; | |
| 1338 fd->bl = spares; | |
| 1339 pthread_mutex_unlock(&fd->bam_list_lock); | |
| 1340 c->bams = NULL; | |
| 1341 | |
| 1342 /* Detect if a multi-seq container */ | |
| 1343 cram_stats_encoding(fd, c->stats[DS_RI]); | |
| 1344 multi_ref = c->stats[DS_RI]->nvals > 1; | |
| 1345 | |
| 1346 if (multi_ref) { | |
| 1347 if (fd->verbose) | |
| 1348 fprintf(stderr, "Multi-ref container\n"); | |
| 1349 c->ref_seq_id = -2; | |
| 1350 c->ref_seq_start = 0; | |
| 1351 c->ref_seq_span = 0; | |
| 1352 } | |
| 1353 | |
| 1354 | |
| 1355 /* Compute MD5s */ | |
| 1356 for (i = 0; i < c->curr_slice; i++) { | |
| 1357 cram_slice *s = c->slices[i]; | |
| 1358 | |
| 1359 if (CRAM_MAJOR_VERS(fd->version) != 1) { | |
| 1360 if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) { | |
| 1361 MD5_CTX md5; | |
| 1362 MD5_Init(&md5); | |
| 1363 MD5_Update(&md5, | |
| 1364 c->ref + s->hdr->ref_seq_start - c->ref_start, | |
| 1365 s->hdr->ref_seq_span); | |
| 1366 MD5_Final(s->hdr->md5, &md5); | |
| 1367 } else { | |
| 1368 memset(s->hdr->md5, 0, 16); | |
| 1369 } | |
| 1370 } | |
| 1371 } | |
| 1372 | |
| 1373 c->num_records = 0; | |
| 1374 c->num_blocks = 0; | |
| 1375 c->length = 0; | |
| 1376 | |
| 1377 //fprintf(stderr, "=== BF ===\n"); | |
| 1378 h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]), | |
| 1379 c->stats[DS_BF], E_INT, NULL, | |
| 1380 fd->version); | |
| 1381 | |
| 1382 //fprintf(stderr, "=== CF ===\n"); | |
| 1383 h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]), | |
| 1384 c->stats[DS_CF], E_INT, NULL, | |
| 1385 fd->version); | |
| 1386 // fprintf(stderr, "=== RN ===\n"); | |
| 1387 // h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]), | |
| 1388 // c->stats[DS_RN], E_BYTE_ARRAY, NULL, | |
| 1389 // fd->version); | |
| 1390 | |
| 1391 //fprintf(stderr, "=== AP ===\n"); | |
| 1392 if (c->pos_sorted) { | |
| 1393 h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), | |
| 1394 c->stats[DS_AP], E_INT, NULL, | |
| 1395 fd->version); | |
| 1396 } else { | |
| 1397 int p[2] = {0, c->max_apos}; | |
| 1398 h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p, | |
| 1399 fd->version); | |
| 1400 } | |
| 1401 | |
| 1402 //fprintf(stderr, "=== RG ===\n"); | |
| 1403 h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]), | |
| 1404 c->stats[DS_RG], E_INT, NULL, | |
| 1405 fd->version); | |
| 1406 | |
| 1407 //fprintf(stderr, "=== MQ ===\n"); | |
| 1408 h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]), | |
| 1409 c->stats[DS_MQ], E_INT, NULL, | |
| 1410 fd->version); | |
| 1411 | |
| 1412 //fprintf(stderr, "=== NS ===\n"); | |
| 1413 h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), | |
| 1414 c->stats[DS_NS], E_INT, NULL, | |
| 1415 fd->version); | |
| 1416 | |
| 1417 //fprintf(stderr, "=== MF ===\n"); | |
| 1418 h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]), | |
| 1419 c->stats[DS_MF], E_INT, NULL, | |
| 1420 fd->version); | |
| 1421 | |
| 1422 //fprintf(stderr, "=== TS ===\n"); | |
| 1423 h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), | |
| 1424 c->stats[DS_TS], E_INT, NULL, | |
| 1425 fd->version); | |
| 1426 //fprintf(stderr, "=== NP ===\n"); | |
| 1427 h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]), | |
| 1428 c->stats[DS_NP], E_INT, NULL, | |
| 1429 fd->version); | |
| 1430 //fprintf(stderr, "=== NF ===\n"); | |
| 1431 h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]), | |
| 1432 c->stats[DS_NF], E_INT, NULL, | |
| 1433 fd->version); | |
| 1434 | |
| 1435 //fprintf(stderr, "=== RL ===\n"); | |
| 1436 h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]), | |
| 1437 c->stats[DS_RL], E_INT, NULL, | |
| 1438 fd->version); | |
| 1439 | |
| 1440 //fprintf(stderr, "=== FN ===\n"); | |
| 1441 h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]), | |
| 1442 c->stats[DS_FN], E_INT, NULL, | |
| 1443 fd->version); | |
| 1444 | |
| 1445 //fprintf(stderr, "=== FC ===\n"); | |
| 1446 h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]), | |
| 1447 c->stats[DS_FC], E_BYTE, NULL, | |
| 1448 fd->version); | |
| 1449 | |
| 1450 //fprintf(stderr, "=== FP ===\n"); | |
| 1451 h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]), | |
| 1452 c->stats[DS_FP], E_INT, NULL, | |
| 1453 fd->version); | |
| 1454 | |
| 1455 //fprintf(stderr, "=== DL ===\n"); | |
| 1456 h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]), | |
| 1457 c->stats[DS_DL], E_INT, NULL, | |
| 1458 fd->version); | |
| 1459 | |
| 1460 //fprintf(stderr, "=== BA ===\n"); | |
| 1461 h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]), | |
| 1462 c->stats[DS_BA], E_BYTE, NULL, | |
| 1463 fd->version); | |
| 1464 | |
| 1465 if (CRAM_MAJOR_VERS(fd->version) >= 3) { | |
| 1466 cram_byte_array_len_encoder e; | |
| 1467 | |
| 1468 e.len_encoding = E_EXTERNAL; | |
| 1469 e.len_dat = (void *)DS_BB_len; | |
| 1470 //e.len_dat = (void *)DS_BB; | |
| 1471 | |
| 1472 e.val_encoding = E_EXTERNAL; | |
| 1473 e.val_dat = (void *)DS_BB; | |
| 1474 | |
| 1475 h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, | |
| 1476 E_BYTE_ARRAY, (void *)&e, | |
| 1477 fd->version); | |
| 1478 } else { | |
| 1479 h->codecs[DS_BB] = NULL; | |
| 1480 } | |
| 1481 | |
| 1482 //fprintf(stderr, "=== BS ===\n"); | |
| 1483 h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]), | |
| 1484 c->stats[DS_BS], E_BYTE, NULL, | |
| 1485 fd->version); | |
| 1486 | |
| 1487 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
| 1488 h->codecs[DS_TL] = NULL; | |
| 1489 h->codecs[DS_RI] = NULL; | |
| 1490 h->codecs[DS_RS] = NULL; | |
| 1491 h->codecs[DS_PD] = NULL; | |
| 1492 h->codecs[DS_HC] = NULL; | |
| 1493 h->codecs[DS_SC] = NULL; | |
| 1494 | |
| 1495 //fprintf(stderr, "=== TC ===\n"); | |
| 1496 h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]), | |
| 1497 c->stats[DS_TC], E_BYTE, NULL, | |
| 1498 fd->version); | |
| 1499 | |
| 1500 //fprintf(stderr, "=== TN ===\n"); | |
| 1501 h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]), | |
| 1502 c->stats[DS_TN], E_INT, NULL, | |
| 1503 fd->version); | |
| 1504 } else { | |
| 1505 h->codecs[DS_TC] = NULL; | |
| 1506 h->codecs[DS_TN] = NULL; | |
| 1507 | |
| 1508 //fprintf(stderr, "=== TL ===\n"); | |
| 1509 h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]), | |
| 1510 c->stats[DS_TL], E_INT, NULL, | |
| 1511 fd->version); | |
| 1512 | |
| 1513 | |
| 1514 //fprintf(stderr, "=== RI ===\n"); | |
| 1515 h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), | |
| 1516 c->stats[DS_RI], E_INT, NULL, | |
| 1517 fd->version); | |
| 1518 | |
| 1519 //fprintf(stderr, "=== RS ===\n"); | |
| 1520 h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]), | |
| 1521 c->stats[DS_RS], E_INT, NULL, | |
| 1522 fd->version); | |
| 1523 | |
| 1524 //fprintf(stderr, "=== PD ===\n"); | |
| 1525 h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]), | |
| 1526 c->stats[DS_PD], E_INT, NULL, | |
| 1527 fd->version); | |
| 1528 | |
| 1529 //fprintf(stderr, "=== HC ===\n"); | |
| 1530 h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]), | |
| 1531 c->stats[DS_HC], E_INT, NULL, | |
| 1532 fd->version); | |
| 1533 | |
| 1534 //fprintf(stderr, "=== SC ===\n"); | |
| 1535 if (1) { | |
| 1536 int i2[2] = {0, DS_SC}; | |
| 1537 | |
| 1538 h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, | |
| 1539 E_BYTE_ARRAY, (void *)i2, | |
| 1540 fd->version); | |
| 1541 } else { | |
| 1542 // Appears to be no practical benefit to using this method, | |
| 1543 // but it may work better if we start mixing SC, IN and BB | |
| 1544 // elements into the same external block. | |
| 1545 cram_byte_array_len_encoder e; | |
| 1546 | |
| 1547 e.len_encoding = E_EXTERNAL; | |
| 1548 e.len_dat = (void *)DS_SC_len; | |
| 1549 | |
| 1550 e.val_encoding = E_EXTERNAL; | |
| 1551 e.val_dat = (void *)DS_SC; | |
| 1552 | |
| 1553 h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, | |
| 1554 E_BYTE_ARRAY, (void *)&e, | |
| 1555 fd->version); | |
| 1556 } | |
| 1557 } | |
| 1558 | |
| 1559 //fprintf(stderr, "=== IN ===\n"); | |
| 1560 { | |
| 1561 int i2[2] = {0, DS_IN}; | |
| 1562 h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, | |
| 1563 E_BYTE_ARRAY, (void *)i2, | |
| 1564 fd->version); | |
| 1565 } | |
| 1566 | |
| 1567 h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, | |
| 1568 (void *)DS_QS, | |
| 1569 fd->version); | |
| 1570 { | |
| 1571 int i2[2] = {0, DS_RN}; | |
| 1572 h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, | |
| 1573 E_BYTE_ARRAY, (void *)i2, | |
| 1574 fd->version); | |
| 1575 } | |
| 1576 | |
| 1577 | |
| 1578 /* Encode slices */ | |
| 1579 for (i = 0; i < c->curr_slice; i++) { | |
| 1580 if (fd->verbose) | |
| 1581 fprintf(stderr, "Encode slice %d\n", i); | |
| 1582 if (cram_encode_slice(fd, c, h, c->slices[i]) != 0) | |
| 1583 return -1; | |
| 1584 } | |
| 1585 | |
| 1586 /* Create compression header */ | |
| 1587 { | |
| 1588 h->ref_seq_id = c->ref_seq_id; | |
| 1589 h->ref_seq_start = c->ref_seq_start; | |
| 1590 h->ref_seq_span = c->ref_seq_span; | |
| 1591 h->num_records = c->num_records; | |
| 1592 | |
| 1593 h->mapped_qs_included = 0; // fixme | |
| 1594 h->unmapped_qs_included = 0; // fixme | |
| 1595 // h->... fixme | |
| 1596 memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20); | |
| 1597 | |
| 1598 if (!(c_hdr = cram_encode_compression_header(fd, c, h))) | |
| 1599 return -1; | |
| 1600 } | |
| 1601 | |
| 1602 /* Compute landmarks */ | |
| 1603 /* Fill out slice landmarks */ | |
| 1604 c->num_landmarks = c->curr_slice; | |
| 1605 c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark)); | |
| 1606 if (!c->landmark) | |
| 1607 return -1; | |
| 1608 | |
| 1609 /* | |
| 1610 * Slice offset starts after the first block, so we need to simulate | |
| 1611 * writing it to work out the correct offset | |
| 1612 */ | |
| 1613 { | |
| 1614 slice_offset = c_hdr->method == RAW | |
| 1615 ? c_hdr->uncomp_size | |
| 1616 : c_hdr->comp_size; | |
| 1617 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + | |
| 1618 itf8_size(c_hdr->content_id) + | |
| 1619 itf8_size(c_hdr->comp_size) + | |
| 1620 itf8_size(c_hdr->uncomp_size); | |
| 1621 } | |
| 1622 | |
| 1623 c->ref_seq_id = c->slices[0]->hdr->ref_seq_id; | |
| 1624 c->ref_seq_start = c->slices[0]->hdr->ref_seq_start; | |
| 1625 c->ref_seq_span = c->slices[0]->hdr->ref_seq_span; | |
| 1626 for (i = 0; i < c->curr_slice; i++) { | |
| 1627 cram_slice *s = c->slices[i]; | |
| 1628 | |
| 1629 c->num_blocks += s->hdr->num_blocks + 2; | |
| 1630 c->landmark[i] = slice_offset; | |
| 1631 | |
| 1632 if (s->hdr->ref_seq_start + s->hdr->ref_seq_span > | |
| 1633 c->ref_seq_start + c->ref_seq_span) { | |
| 1634 c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span | |
| 1635 - c->ref_seq_start; | |
| 1636 } | |
| 1637 | |
| 1638 slice_offset += s->hdr_block->method == RAW | |
| 1639 ? s->hdr_block->uncomp_size | |
| 1640 : s->hdr_block->comp_size; | |
| 1641 | |
| 1642 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + | |
| 1643 itf8_size(s->hdr_block->content_id) + | |
| 1644 itf8_size(s->hdr_block->comp_size) + | |
| 1645 itf8_size(s->hdr_block->uncomp_size); | |
| 1646 | |
| 1647 for (j = 0; j < s->hdr->num_blocks; j++) { | |
| 1648 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + | |
| 1649 itf8_size(s->block[j]->content_id) + | |
| 1650 itf8_size(s->block[j]->comp_size) + | |
| 1651 itf8_size(s->block[j]->uncomp_size); | |
| 1652 | |
| 1653 slice_offset += s->block[j]->method == RAW | |
| 1654 ? s->block[j]->uncomp_size | |
| 1655 : s->block[j]->comp_size; | |
| 1656 } | |
| 1657 } | |
| 1658 c->length += slice_offset; // just past the final slice | |
| 1659 | |
| 1660 c->comp_hdr_block = c_hdr; | |
| 1661 | |
| 1662 if (c->ref_seq_id >= 0) { | |
| 1663 cram_ref_decr(fd->refs, c->ref_seq_id); | |
| 1664 } | |
| 1665 | |
| 1666 /* Cache references up-front if we have unsorted access patterns */ | |
| 1667 if (!fd->no_ref && c->refs_used) { | |
| 1668 for (i = 0; i < fd->refs->nref; i++) { | |
| 1669 if (c->refs_used[i]) | |
| 1670 cram_ref_decr(fd->refs, i); | |
| 1671 } | |
| 1672 } | |
| 1673 | |
| 1674 return 0; | |
| 1675 } | |
| 1676 | |
| 1677 | |
| 1678 /* | |
| 1679 * Adds a feature code to a read within a slice. For purposes of minimising | |
| 1680 * memory allocations and fragmentation we have one array of features for all | |
| 1681 * reads within the slice. We return the index into this array for this new | |
| 1682 * feature. | |
| 1683 * | |
| 1684 * Returns feature index on success | |
| 1685 * -1 on failure. | |
| 1686 */ | |
| 1687 static int cram_add_feature(cram_container *c, cram_slice *s, | |
| 1688 cram_record *r, cram_feature *f) { | |
| 1689 if (s->nfeatures >= s->afeatures) { | |
| 1690 s->afeatures = s->afeatures ? s->afeatures*2 : 1024; | |
| 1691 s->features = realloc(s->features, s->afeatures*sizeof(*s->features)); | |
| 1692 if (!s->features) | |
| 1693 return -1; | |
| 1694 } | |
| 1695 | |
| 1696 if (!r->nfeature++) { | |
| 1697 r->feature = s->nfeatures; | |
| 1698 cram_stats_add(c->stats[DS_FP], f->X.pos); | |
| 1699 } else { | |
| 1700 cram_stats_add(c->stats[DS_FP], | |
| 1701 f->X.pos - s->features[r->feature + r->nfeature-2].X.pos); | |
| 1702 } | |
| 1703 cram_stats_add(c->stats[DS_FC], f->X.code); | |
| 1704 | |
| 1705 s->features[s->nfeatures++] = *f; | |
| 1706 | |
| 1707 return 0; | |
| 1708 } | |
| 1709 | |
| 1710 static int cram_add_substitution(cram_fd *fd, cram_container *c, | |
| 1711 cram_slice *s, cram_record *r, | |
| 1712 int pos, char base, char qual, char ref) { | |
| 1713 cram_feature f; | |
| 1714 | |
| 1715 // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN | |
| 1716 if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) { | |
| 1717 f.X.pos = pos+1; | |
| 1718 f.X.code = 'X'; | |
| 1719 f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f]; | |
| 1720 cram_stats_add(c->stats[DS_BS], f.X.base); | |
| 1721 } else { | |
| 1722 f.B.pos = pos+1; | |
| 1723 f.B.code = 'B'; | |
| 1724 f.B.base = base; | |
| 1725 f.B.qual = qual; | |
| 1726 cram_stats_add(c->stats[DS_BA], f.B.base); | |
| 1727 cram_stats_add(c->stats[DS_QS], f.B.qual); | |
| 1728 BLOCK_APPEND_CHAR(s->qual_blk, qual); | |
| 1729 } | |
| 1730 return cram_add_feature(c, s, r, &f); | |
| 1731 } | |
| 1732 | |
| 1733 static int cram_add_bases(cram_fd *fd, cram_container *c, | |
| 1734 cram_slice *s, cram_record *r, | |
| 1735 int pos, int len, char *base) { | |
| 1736 cram_feature f; | |
| 1737 | |
| 1738 f.b.pos = pos+1; | |
| 1739 f.b.code = 'b'; | |
| 1740 f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk); | |
| 1741 f.b.len = len; | |
| 1742 | |
| 1743 return cram_add_feature(c, s, r, &f); | |
| 1744 } | |
| 1745 | |
| 1746 static int cram_add_base(cram_fd *fd, cram_container *c, | |
| 1747 cram_slice *s, cram_record *r, | |
| 1748 int pos, char base, char qual) { | |
| 1749 cram_feature f; | |
| 1750 f.B.pos = pos+1; | |
| 1751 f.B.code = 'B'; | |
| 1752 f.B.base = base; | |
| 1753 f.B.qual = qual; | |
| 1754 cram_stats_add(c->stats[DS_BA], base); | |
| 1755 cram_stats_add(c->stats[DS_QS], qual); | |
| 1756 BLOCK_APPEND_CHAR(s->qual_blk, qual); | |
| 1757 return cram_add_feature(c, s, r, &f); | |
| 1758 } | |
| 1759 | |
| 1760 static int cram_add_quality(cram_fd *fd, cram_container *c, | |
| 1761 cram_slice *s, cram_record *r, | |
| 1762 int pos, char qual) { | |
| 1763 cram_feature f; | |
| 1764 f.Q.pos = pos+1; | |
| 1765 f.Q.code = 'Q'; | |
| 1766 f.Q.qual = qual; | |
| 1767 cram_stats_add(c->stats[DS_QS], qual); | |
| 1768 BLOCK_APPEND_CHAR(s->qual_blk, qual); | |
| 1769 return cram_add_feature(c, s, r, &f); | |
| 1770 } | |
| 1771 | |
| 1772 static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r, | |
| 1773 int pos, int len, char *base) { | |
| 1774 cram_feature f; | |
| 1775 f.D.pos = pos+1; | |
| 1776 f.D.code = 'D'; | |
| 1777 f.D.len = len; | |
| 1778 cram_stats_add(c->stats[DS_DL], len); | |
| 1779 return cram_add_feature(c, s, r, &f); | |
| 1780 } | |
| 1781 | |
| 1782 static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r, | |
| 1783 int pos, int len, char *base, int version) { | |
| 1784 cram_feature f; | |
| 1785 f.S.pos = pos+1; | |
| 1786 f.S.code = 'S'; | |
| 1787 f.S.len = len; | |
| 1788 switch (CRAM_MAJOR_VERS(version)) { | |
| 1789 case 1: | |
| 1790 f.S.seq_idx = BLOCK_SIZE(s->base_blk); | |
| 1791 BLOCK_APPEND(s->base_blk, base, len); | |
| 1792 BLOCK_APPEND_CHAR(s->base_blk, '\0'); | |
| 1793 break; | |
| 1794 | |
| 1795 case 2: | |
| 1796 default: | |
| 1797 f.S.seq_idx = BLOCK_SIZE(s->soft_blk); | |
| 1798 if (base) { | |
| 1799 BLOCK_APPEND(s->soft_blk, base, len); | |
| 1800 } else { | |
| 1801 int i; | |
| 1802 for (i = 0; i < len; i++) | |
| 1803 BLOCK_APPEND_CHAR(s->soft_blk, 'N'); | |
| 1804 } | |
| 1805 BLOCK_APPEND_CHAR(s->soft_blk, '\0'); | |
| 1806 break; | |
| 1807 | |
| 1808 // default: | |
| 1809 // // v3.0 onwards uses BB data-series | |
| 1810 // f.S.seq_idx = BLOCK_SIZE(s->soft_blk); | |
| 1811 } | |
| 1812 return cram_add_feature(c, s, r, &f); | |
| 1813 } | |
| 1814 | |
| 1815 static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r, | |
| 1816 int pos, int len, char *base) { | |
| 1817 cram_feature f; | |
| 1818 f.S.pos = pos+1; | |
| 1819 f.S.code = 'H'; | |
| 1820 f.S.len = len; | |
| 1821 cram_stats_add(c->stats[DS_HC], len); | |
| 1822 return cram_add_feature(c, s, r, &f); | |
| 1823 } | |
| 1824 | |
| 1825 static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r, | |
| 1826 int pos, int len, char *base) { | |
| 1827 cram_feature f; | |
| 1828 f.S.pos = pos+1; | |
| 1829 f.S.code = 'N'; | |
| 1830 f.S.len = len; | |
| 1831 cram_stats_add(c->stats[DS_RS], len); | |
| 1832 return cram_add_feature(c, s, r, &f); | |
| 1833 } | |
| 1834 | |
| 1835 static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r, | |
| 1836 int pos, int len, char *base) { | |
| 1837 cram_feature f; | |
| 1838 f.S.pos = pos+1; | |
| 1839 f.S.code = 'P'; | |
| 1840 f.S.len = len; | |
| 1841 cram_stats_add(c->stats[DS_PD], len); | |
| 1842 return cram_add_feature(c, s, r, &f); | |
| 1843 } | |
| 1844 | |
| 1845 static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, | |
| 1846 int pos, int len, char *base) { | |
| 1847 cram_feature f; | |
| 1848 f.I.pos = pos+1; | |
| 1849 if (len == 1) { | |
| 1850 char b = base ? *base : 'N'; | |
| 1851 f.i.code = 'i'; | |
| 1852 f.i.base = b; | |
| 1853 cram_stats_add(c->stats[DS_BA], b); | |
| 1854 } else { | |
| 1855 f.I.code = 'I'; | |
| 1856 f.I.len = len; | |
| 1857 f.S.seq_idx = BLOCK_SIZE(s->base_blk); | |
| 1858 if (base) { | |
| 1859 BLOCK_APPEND(s->base_blk, base, len); | |
| 1860 } else { | |
| 1861 int i; | |
| 1862 for (i = 0; i < len; i++) | |
| 1863 BLOCK_APPEND_CHAR(s->base_blk, 'N'); | |
| 1864 } | |
| 1865 BLOCK_APPEND_CHAR(s->base_blk, '\0'); | |
| 1866 } | |
| 1867 return cram_add_feature(c, s, r, &f); | |
| 1868 } | |
| 1869 | |
| 1870 /* | |
| 1871 * Encodes auxiliary data. | |
| 1872 * Returns the read-group parsed out of the BAM aux fields on success | |
| 1873 * NULL on failure or no rg present (FIXME) | |
| 1874 */ | |
| 1875 static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, | |
| 1876 cram_slice *s, cram_record *cr) { | |
| 1877 char *aux, *tmp, *rg = NULL; | |
| 1878 int aux_size = bam_blk_size(b) - | |
| 1879 ((char *)bam_aux(b) - (char *)&bam_ref(b)); | |
| 1880 | |
| 1881 /* Worst case is 1 nul char on every ??:Z: string, so +33% */ | |
| 1882 BLOCK_GROW(s->aux_blk, aux_size*1.34+1); | |
| 1883 tmp = (char *)BLOCK_END(s->aux_blk); | |
| 1884 | |
| 1885 aux = (char *)bam_aux(b); | |
| 1886 cr->TN_idx = s->nTN; | |
| 1887 | |
| 1888 while (aux[0] != 0) { | |
| 1889 int32_t i32; | |
| 1890 int r; | |
| 1891 | |
| 1892 if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { | |
| 1893 rg = &aux[3]; | |
| 1894 while (*aux++); | |
| 1895 continue; | |
| 1896 } | |
| 1897 if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { | |
| 1898 while (*aux++); | |
| 1899 continue; | |
| 1900 } | |
| 1901 if (aux[0] == 'N' && aux[1] == 'M') { | |
| 1902 switch(aux[2]) { | |
| 1903 case 'A': case 'C': case 'c': aux+=4; break; | |
| 1904 case 'I': case 'i': case 'f': aux+=7; break; | |
| 1905 default: | |
| 1906 fprintf(stderr, "Unhandled type code for NM tag\n"); | |
| 1907 return NULL; | |
| 1908 } | |
| 1909 continue; | |
| 1910 } | |
| 1911 | |
| 1912 cr->ntags++; | |
| 1913 | |
| 1914 i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2]; | |
| 1915 kh_put(s_i2i, c->tags_used, i32, &r); | |
| 1916 if (-1 == r) | |
| 1917 return NULL; | |
| 1918 | |
| 1919 if (s->nTN >= s->aTN) { | |
| 1920 s->aTN = s->aTN ? s->aTN*2 : 1024; | |
| 1921 if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN)))) | |
| 1922 return NULL; | |
| 1923 } | |
| 1924 s->TN[s->nTN++] = i32; | |
| 1925 cram_stats_add(c->stats[DS_TN], i32); | |
| 1926 | |
| 1927 switch(aux[2]) { | |
| 1928 case 'A': case 'C': case 'c': | |
| 1929 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 1930 *tmp++=*aux++; | |
| 1931 break; | |
| 1932 | |
| 1933 case 'S': case 's': | |
| 1934 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 1935 *tmp++=*aux++; *tmp++=*aux++; | |
| 1936 break; | |
| 1937 | |
| 1938 case 'I': case 'i': case 'f': | |
| 1939 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 1940 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 1941 break; | |
| 1942 | |
| 1943 case 'd': | |
| 1944 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 1945 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 1946 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 1947 break; | |
| 1948 | |
| 1949 case 'Z': case 'H': | |
| 1950 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 1951 while ((*tmp++=*aux++)); | |
| 1952 *tmp++ = '\t'; // stop byte | |
| 1953 break; | |
| 1954 | |
| 1955 case 'B': { | |
| 1956 int type = aux[3], blen; | |
| 1957 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + | |
| 1958 (((unsigned char *)aux)[5]<< 8) + | |
| 1959 (((unsigned char *)aux)[6]<<16) + | |
| 1960 (((unsigned char *)aux)[7]<<24)); | |
| 1961 // skip TN field | |
| 1962 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 1963 | |
| 1964 // We use BYTE_ARRAY_LEN with external length, so store that first | |
| 1965 switch (type) { | |
| 1966 case 'c': case 'C': | |
| 1967 blen = count; | |
| 1968 break; | |
| 1969 case 's': case 'S': | |
| 1970 blen = 2*count; | |
| 1971 break; | |
| 1972 case 'i': case 'I': case 'f': | |
| 1973 blen = 4*count; | |
| 1974 break; | |
| 1975 default: | |
| 1976 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n", | |
| 1977 type); | |
| 1978 return NULL; | |
| 1979 | |
| 1980 } | |
| 1981 | |
| 1982 tmp += itf8_put(tmp, blen+5); | |
| 1983 | |
| 1984 *tmp++=*aux++; // sub-type & length | |
| 1985 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 1986 | |
| 1987 // The tag data itself | |
| 1988 memcpy(tmp, aux, blen); tmp += blen; aux += blen; | |
| 1989 | |
| 1990 //cram_stats_add(c->aux_B_stats, blen); | |
| 1991 break; | |
| 1992 } | |
| 1993 default: | |
| 1994 fprintf(stderr, "Unknown aux type '%c'\n", aux[2]); | |
| 1995 return NULL; | |
| 1996 } | |
| 1997 } | |
| 1998 cram_stats_add(c->stats[DS_TC], cr->ntags); | |
| 1999 | |
| 2000 cr->aux = BLOCK_SIZE(s->aux_blk); | |
| 2001 cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); | |
| 2002 BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); | |
| 2003 assert(s->aux_blk->byte <= s->aux_blk->alloc); | |
| 2004 | |
| 2005 return rg; | |
| 2006 } | |
| 2007 | |
| 2008 /* | |
| 2009 * Encodes auxiliary data. Largely duplicated from above, but done so to | |
| 2010 * keep it simple and avoid a myriad of version ifs. | |
| 2011 * | |
| 2012 * Returns the read-group parsed out of the BAM aux fields on success | |
| 2013 * NULL on failure or no rg present (FIXME) | |
| 2014 */ | |
| 2015 static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, | |
| 2016 cram_slice *s, cram_record *cr) { | |
| 2017 char *aux, *orig, *tmp, *rg = NULL; | |
| 2018 int aux_size = bam_get_l_aux(b); | |
| 2019 cram_block *td_b = c->comp_hdr->TD_blk; | |
| 2020 int TD_blk_size = BLOCK_SIZE(td_b), new; | |
| 2021 char *key; | |
| 2022 khint_t k; | |
| 2023 | |
| 2024 | |
| 2025 /* Worst case is 1 nul char on every ??:Z: string, so +33% */ | |
| 2026 BLOCK_GROW(s->aux_blk, aux_size*1.34+1); | |
| 2027 tmp = (char *)BLOCK_END(s->aux_blk); | |
| 2028 | |
| 2029 | |
| 2030 orig = aux = (char *)bam_aux(b); | |
| 2031 | |
| 2032 // Copy aux keys to td_b and aux values to s->aux_blk | |
| 2033 while (aux - orig < aux_size && aux[0] != 0) { | |
| 2034 uint32_t i32; | |
| 2035 int r; | |
| 2036 | |
| 2037 if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { | |
| 2038 rg = &aux[3]; | |
| 2039 while (*aux++); | |
| 2040 continue; | |
| 2041 } | |
| 2042 if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { | |
| 2043 while (*aux++); | |
| 2044 continue; | |
| 2045 } | |
| 2046 if (aux[0] == 'N' && aux[1] == 'M') { | |
| 2047 switch(aux[2]) { | |
| 2048 case 'A': case 'C': case 'c': aux+=4; break; | |
| 2049 case 'S': case 's': aux+=5; break; | |
| 2050 case 'I': case 'i': case 'f': aux+=7; break; | |
| 2051 default: | |
| 2052 fprintf(stderr, "Unhandled type code for NM tag\n"); | |
| 2053 return NULL; | |
| 2054 } | |
| 2055 continue; | |
| 2056 } | |
| 2057 | |
| 2058 BLOCK_APPEND(td_b, aux, 3); | |
| 2059 | |
| 2060 i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2]; | |
| 2061 kh_put(s_i2i, c->tags_used, i32, &r); | |
| 2062 if (-1 == r) | |
| 2063 return NULL; | |
| 2064 | |
| 2065 // BQ:Z | |
| 2066 if (aux[0] == 'B' && aux[1] == 'Q' && aux[2] == 'Z') { | |
| 2067 char *tmp; | |
| 2068 if (!s->aux_BQ_blk) | |
| 2069 if (!(s->aux_BQ_blk = cram_new_block(EXTERNAL, DS_aux_BQ))) | |
| 2070 return NULL; | |
| 2071 BLOCK_GROW(s->aux_BQ_blk, aux_size*1.34+1); | |
| 2072 tmp = (char *)BLOCK_END(s->aux_BQ_blk); | |
| 2073 aux += 3; | |
| 2074 while ((*tmp++=*aux++)); | |
| 2075 *tmp++ = '\t'; | |
| 2076 BLOCK_SIZE(s->aux_BQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BQ_blk); | |
| 2077 continue; | |
| 2078 } | |
| 2079 | |
| 2080 // BD:Z | |
| 2081 if (aux[0] == 'B' && aux[1]=='D' && aux[2] == 'Z') { | |
| 2082 char *tmp; | |
| 2083 if (!s->aux_BD_blk) | |
| 2084 if (!(s->aux_BD_blk = cram_new_block(EXTERNAL, DS_aux_BD))) | |
| 2085 return NULL; | |
| 2086 BLOCK_GROW(s->aux_BD_blk, aux_size*1.34+1); | |
| 2087 tmp = (char *)BLOCK_END(s->aux_BD_blk); | |
| 2088 aux += 3; | |
| 2089 while ((*tmp++=*aux++)); | |
| 2090 *tmp++ = '\t'; | |
| 2091 BLOCK_SIZE(s->aux_BD_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BD_blk); | |
| 2092 continue; | |
| 2093 } | |
| 2094 | |
| 2095 // BI:Z | |
| 2096 if (aux[0] == 'B' && aux[1]=='I' && aux[2] == 'Z') { | |
| 2097 char *tmp; | |
| 2098 if (!s->aux_BI_blk) | |
| 2099 if (!(s->aux_BI_blk = cram_new_block(EXTERNAL, DS_aux_BI))) | |
| 2100 return NULL; | |
| 2101 BLOCK_GROW(s->aux_BI_blk, aux_size*1.34+1); | |
| 2102 tmp = (char *)BLOCK_END(s->aux_BI_blk); | |
| 2103 aux += 3; | |
| 2104 while ((*tmp++=*aux++)); | |
| 2105 *tmp++ = '\t'; | |
| 2106 BLOCK_SIZE(s->aux_BI_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BI_blk); | |
| 2107 continue; | |
| 2108 } | |
| 2109 | |
| 2110 // OQ:Z: | |
| 2111 if (aux[0] == 'O' && aux[1] == 'Q' && aux[2] == 'Z') { | |
| 2112 char *tmp; | |
| 2113 if (!s->aux_OQ_blk) | |
| 2114 if (!(s->aux_OQ_blk = cram_new_block(EXTERNAL, DS_aux_OQ))) | |
| 2115 return NULL; | |
| 2116 BLOCK_GROW(s->aux_OQ_blk, aux_size*1.34+1); | |
| 2117 tmp = (char *)BLOCK_END(s->aux_OQ_blk); | |
| 2118 aux += 3; | |
| 2119 while ((*tmp++=*aux++)); | |
| 2120 *tmp++ = '\t'; | |
| 2121 BLOCK_SIZE(s->aux_OQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_OQ_blk); | |
| 2122 continue; | |
| 2123 } | |
| 2124 | |
| 2125 // FZ:B or ZM:B | |
| 2126 if ((aux[0] == 'F' && aux[1] == 'Z' && aux[2] == 'B') || | |
| 2127 (aux[0] == 'Z' && aux[1] == 'M' && aux[2] == 'B')) { | |
| 2128 int type = aux[3], blen; | |
| 2129 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + | |
| 2130 (((unsigned char *)aux)[5]<< 8) + | |
| 2131 (((unsigned char *)aux)[6]<<16) + | |
| 2132 (((unsigned char *)aux)[7]<<24)); | |
| 2133 char *tmp; | |
| 2134 if (!s->aux_FZ_blk) | |
| 2135 if (!(s->aux_FZ_blk = cram_new_block(EXTERNAL, DS_aux_FZ))) | |
| 2136 return NULL; | |
| 2137 BLOCK_GROW(s->aux_FZ_blk, aux_size*1.34+1); | |
| 2138 tmp = (char *)BLOCK_END(s->aux_FZ_blk); | |
| 2139 | |
| 2140 // skip TN field | |
| 2141 aux+=3; | |
| 2142 | |
| 2143 // We use BYTE_ARRAY_LEN with external length, so store that first | |
| 2144 switch (type) { | |
| 2145 case 'c': case 'C': | |
| 2146 blen = count; | |
| 2147 break; | |
| 2148 case 's': case 'S': | |
| 2149 blen = 2*count; | |
| 2150 break; | |
| 2151 case 'i': case 'I': case 'f': | |
| 2152 blen = 4*count; | |
| 2153 break; | |
| 2154 default: | |
| 2155 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n", | |
| 2156 type); | |
| 2157 return NULL; | |
| 2158 | |
| 2159 } | |
| 2160 | |
| 2161 blen += 5; // sub-type & length | |
| 2162 tmp += itf8_put(tmp, blen); | |
| 2163 | |
| 2164 // The tag data itself | |
| 2165 memcpy(tmp, aux, blen); tmp += blen; aux += blen; | |
| 2166 | |
| 2167 BLOCK_SIZE(s->aux_FZ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_FZ_blk); | |
| 2168 continue; | |
| 2169 } | |
| 2170 | |
| 2171 // Other quality data - {Q2,E2,U2,CQ}:Z and similar | |
| 2172 if (((aux[0] == 'Q' && aux[1] == '2') || | |
| 2173 (aux[0] == 'U' && aux[1] == '2') || | |
| 2174 (aux[0] == 'Q' && aux[1] == 'T') || | |
| 2175 (aux[0] == 'C' && aux[1] == 'Q')) && aux[2] == 'Z') { | |
| 2176 char *tmp; | |
| 2177 if (!s->aux_oq_blk) | |
| 2178 if (!(s->aux_oq_blk = cram_new_block(EXTERNAL, DS_aux_oq))) | |
| 2179 return NULL; | |
| 2180 BLOCK_GROW(s->aux_oq_blk, aux_size*1.34+1); | |
| 2181 tmp = (char *)BLOCK_END(s->aux_oq_blk); | |
| 2182 aux += 3; | |
| 2183 while ((*tmp++=*aux++)); | |
| 2184 *tmp++ = '\t'; | |
| 2185 BLOCK_SIZE(s->aux_oq_blk) = (uc *)tmp - BLOCK_DATA(s->aux_oq_blk); | |
| 2186 continue; | |
| 2187 } | |
| 2188 | |
| 2189 // Other sequence data - {R2,E2,CS,BC,RT}:Z and similar | |
| 2190 if (((aux[0] == 'R' && aux[1] == '2') || | |
| 2191 (aux[0] == 'E' && aux[1] == '2') || | |
| 2192 (aux[0] == 'C' && aux[1] == 'S') || | |
| 2193 (aux[0] == 'B' && aux[1] == 'C') || | |
| 2194 (aux[0] == 'R' && aux[1] == 'T')) && aux[2] == 'Z') { | |
| 2195 char *tmp; | |
| 2196 if (!s->aux_os_blk) | |
| 2197 if (!(s->aux_os_blk = cram_new_block(EXTERNAL, DS_aux_os))) | |
| 2198 return NULL; | |
| 2199 BLOCK_GROW(s->aux_os_blk, aux_size*1.34+1); | |
| 2200 tmp = (char *)BLOCK_END(s->aux_os_blk); | |
| 2201 aux += 3; | |
| 2202 while ((*tmp++=*aux++)); | |
| 2203 *tmp++ = '\t'; | |
| 2204 BLOCK_SIZE(s->aux_os_blk) = (uc *)tmp - BLOCK_DATA(s->aux_os_blk); | |
| 2205 continue; | |
| 2206 } | |
| 2207 | |
| 2208 | |
| 2209 switch(aux[2]) { | |
| 2210 case 'A': case 'C': case 'c': | |
| 2211 aux+=3; | |
| 2212 *tmp++=*aux++; | |
| 2213 break; | |
| 2214 | |
| 2215 case 'S': case 's': | |
| 2216 aux+=3; | |
| 2217 *tmp++=*aux++; *tmp++=*aux++; | |
| 2218 break; | |
| 2219 | |
| 2220 case 'I': case 'i': case 'f': | |
| 2221 aux+=3; | |
| 2222 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 2223 break; | |
| 2224 | |
| 2225 case 'd': | |
| 2226 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 2227 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 2228 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
| 2229 break; | |
| 2230 | |
| 2231 case 'Z': case 'H': | |
| 2232 { | |
| 2233 char *tmp; | |
| 2234 if (!s->aux_oz_blk) | |
| 2235 if (!(s->aux_oz_blk = cram_new_block(EXTERNAL, DS_aux_oz))) | |
| 2236 return NULL; | |
| 2237 BLOCK_GROW(s->aux_oz_blk, aux_size*1.34+1); | |
| 2238 tmp = (char *)BLOCK_END(s->aux_oz_blk); | |
| 2239 aux += 3; | |
| 2240 while ((*tmp++=*aux++)); | |
| 2241 *tmp++ = '\t'; | |
| 2242 BLOCK_SIZE(s->aux_oz_blk) = (uc *)tmp - | |
| 2243 BLOCK_DATA(s->aux_oz_blk); | |
| 2244 } | |
| 2245 break; | |
| 2246 | |
| 2247 case 'B': { | |
| 2248 int type = aux[3], blen; | |
| 2249 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + | |
| 2250 (((unsigned char *)aux)[5]<< 8) + | |
| 2251 (((unsigned char *)aux)[6]<<16) + | |
| 2252 (((unsigned char *)aux)[7]<<24)); | |
| 2253 // skip TN field | |
| 2254 aux+=3; | |
| 2255 | |
| 2256 // We use BYTE_ARRAY_LEN with external length, so store that first | |
| 2257 switch (type) { | |
| 2258 case 'c': case 'C': | |
| 2259 blen = count; | |
| 2260 break; | |
| 2261 case 's': case 'S': | |
| 2262 blen = 2*count; | |
| 2263 break; | |
| 2264 case 'i': case 'I': case 'f': | |
| 2265 blen = 4*count; | |
| 2266 break; | |
| 2267 default: | |
| 2268 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n", | |
| 2269 type); | |
| 2270 return NULL; | |
| 2271 | |
| 2272 } | |
| 2273 | |
| 2274 blen += 5; // sub-type & length | |
| 2275 tmp += itf8_put(tmp, blen); | |
| 2276 | |
| 2277 // The tag data itself | |
| 2278 memcpy(tmp, aux, blen); tmp += blen; aux += blen; | |
| 2279 | |
| 2280 //cram_stats_add(c->aux_B_stats, blen); | |
| 2281 break; | |
| 2282 } | |
| 2283 default: | |
| 2284 fprintf(stderr, "Unknown aux type '%c'\n", aux[2]); | |
| 2285 return NULL; | |
| 2286 } | |
| 2287 } | |
| 2288 | |
| 2289 // FIXME: sort BLOCK_DATA(td_b) by char[3] triples | |
| 2290 | |
| 2291 // And and increment TD hash entry | |
| 2292 BLOCK_APPEND_CHAR(td_b, 0); | |
| 2293 | |
| 2294 // Duplicate key as BLOCK_DATA() can be realloced to a new pointer. | |
| 2295 key = string_ndup(c->comp_hdr->TD_keys, | |
| 2296 (char *)BLOCK_DATA(td_b) + TD_blk_size, | |
| 2297 BLOCK_SIZE(td_b) - TD_blk_size); | |
| 2298 k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new); | |
| 2299 if (new < 0) { | |
| 2300 return NULL; | |
| 2301 } else if (new == 0) { | |
| 2302 BLOCK_SIZE(td_b) = TD_blk_size; | |
| 2303 } else { | |
| 2304 kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL; | |
| 2305 c->comp_hdr->nTL++; | |
| 2306 } | |
| 2307 | |
| 2308 cr->TL = kh_val(c->comp_hdr->TD_hash, k); | |
| 2309 cram_stats_add(c->stats[DS_TL], cr->TL); | |
| 2310 | |
| 2311 cr->aux = BLOCK_SIZE(s->aux_blk); | |
| 2312 cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); | |
| 2313 BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); | |
| 2314 assert(s->aux_blk->byte <= s->aux_blk->alloc); | |
| 2315 | |
| 2316 return rg; | |
| 2317 } | |
| 2318 | |
| 2319 | |
| 2320 /* | |
| 2321 * Handles creation of a new container or new slice, flushing any | |
| 2322 * existing containers when appropriate. | |
| 2323 * | |
| 2324 * Really this is next slice, which may or may not lead to a new container. | |
| 2325 * | |
| 2326 * Returns cram_container pointer on success | |
| 2327 * NULL on failure. | |
| 2328 */ | |
| 2329 static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { | |
| 2330 cram_container *c = fd->ctr; | |
| 2331 cram_slice *s; | |
| 2332 int i; | |
| 2333 | |
| 2334 /* First occurence */ | |
| 2335 if (c->curr_ref == -2) | |
| 2336 c->curr_ref = bam_ref(b); | |
| 2337 | |
| 2338 if (c->slice) { | |
| 2339 s = c->slice; | |
| 2340 if (c->multi_seq) { | |
| 2341 s->hdr->ref_seq_id = -2; | |
| 2342 s->hdr->ref_seq_start = 0; | |
| 2343 s->hdr->ref_seq_span = 0; | |
| 2344 } else { | |
| 2345 s->hdr->ref_seq_id = c->curr_ref; | |
| 2346 s->hdr->ref_seq_start = c->first_base; | |
| 2347 s->hdr->ref_seq_span = c->last_base - c->first_base + 1; | |
| 2348 } | |
| 2349 s->hdr->num_records = c->curr_rec; | |
| 2350 | |
| 2351 if (c->curr_slice == 0) { | |
| 2352 if (c->ref_seq_id != s->hdr->ref_seq_id) | |
| 2353 c->ref_seq_id = s->hdr->ref_seq_id; | |
| 2354 c->ref_seq_start = c->first_base; | |
| 2355 } | |
| 2356 | |
| 2357 c->curr_slice++; | |
| 2358 } | |
| 2359 | |
| 2360 /* Flush container */ | |
| 2361 if (c->curr_slice == c->max_slice || | |
| 2362 (bam_ref(b) != c->curr_ref && !c->multi_seq)) { | |
| 2363 c->ref_seq_span = fd->last_base - c->ref_seq_start + 1; | |
| 2364 if (fd->verbose) | |
| 2365 fprintf(stderr, "Flush container %d/%d..%d\n", | |
| 2366 c->ref_seq_id, c->ref_seq_start, | |
| 2367 c->ref_seq_start + c->ref_seq_span -1); | |
| 2368 | |
| 2369 /* Encode slices */ | |
| 2370 if (fd->pool) { | |
| 2371 if (-1 == cram_flush_container_mt(fd, c)) | |
| 2372 return NULL; | |
| 2373 } else { | |
| 2374 if (-1 == cram_flush_container(fd, c)) | |
| 2375 return NULL; | |
| 2376 | |
| 2377 // Move to sep func, as we need cram_flush_container for | |
| 2378 // the closing phase to flush the partial container. | |
| 2379 for (i = 0; i < c->max_slice; i++) { | |
| 2380 cram_free_slice(c->slices[i]); | |
| 2381 c->slices[i] = NULL; | |
| 2382 } | |
| 2383 | |
| 2384 c->slice = NULL; | |
| 2385 c->curr_slice = 0; | |
| 2386 | |
| 2387 /* Easy approach for purposes of freeing stats */ | |
| 2388 cram_free_container(c); | |
| 2389 } | |
| 2390 | |
| 2391 c = fd->ctr = cram_new_container(fd->seqs_per_slice, | |
| 2392 fd->slices_per_container); | |
| 2393 if (!c) | |
| 2394 return NULL; | |
| 2395 c->record_counter = fd->record_counter; | |
| 2396 c->curr_ref = bam_ref(b); | |
| 2397 } | |
| 2398 | |
| 2399 c->last_pos = c->first_base = c->last_base = bam_pos(b)+1; | |
| 2400 | |
| 2401 /* New slice */ | |
| 2402 c->slice = c->slices[c->curr_slice] = | |
| 2403 cram_new_slice(MAPPED_SLICE, c->max_rec); | |
| 2404 if (!c->slice) | |
| 2405 return NULL; | |
| 2406 | |
| 2407 if (c->multi_seq) { | |
| 2408 c->slice->hdr->ref_seq_id = -2; | |
| 2409 c->slice->hdr->ref_seq_start = 0; | |
| 2410 c->slice->last_apos = 1; | |
| 2411 } else { | |
| 2412 c->slice->hdr->ref_seq_id = bam_ref(b); | |
| 2413 // wrong for unsorted data, will fix during encoding. | |
| 2414 c->slice->hdr->ref_seq_start = bam_pos(b)+1; | |
| 2415 c->slice->last_apos = bam_pos(b)+1; | |
| 2416 } | |
| 2417 | |
| 2418 c->curr_rec = 0; | |
| 2419 | |
| 2420 return c; | |
| 2421 } | |
| 2422 | |
| 2423 /* | |
| 2424 * Converts a single bam record into a cram record. | |
| 2425 * Possibly used within a thread. | |
| 2426 * | |
| 2427 * Returns 0 on success; | |
| 2428 * -1 on failure | |
| 2429 */ | |
| 2430 static int process_one_read(cram_fd *fd, cram_container *c, | |
| 2431 cram_slice *s, cram_record *cr, | |
| 2432 bam_seq_t *b, int rnum) { | |
| 2433 int i, fake_qual = -1; | |
| 2434 char *cp, *rg; | |
| 2435 char *ref, *seq, *qual; | |
| 2436 | |
| 2437 // FIXME: multi-ref containers | |
| 2438 | |
| 2439 ref = c->ref; | |
| 2440 cr->len = bam_seq_len(b); cram_stats_add(c->stats[DS_RL], cr->len); | |
| 2441 | |
| 2442 //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg); | |
| 2443 | |
| 2444 // Fields to resolve later | |
| 2445 //cr->mate_line; // index to another cram_record | |
| 2446 //cr->mate_flags; // MF | |
| 2447 //cr->ntags; // TC | |
| 2448 cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags); | |
| 2449 if (CRAM_MAJOR_VERS(fd->version) == 1) | |
| 2450 rg = cram_encode_aux_1_0(fd, b, c, s, cr); | |
| 2451 else | |
| 2452 rg = cram_encode_aux(fd, b, c, s, cr); | |
| 2453 | |
| 2454 //cr->aux_size = b->blk_size - ((char *)bam_aux(b) - (char *)&bam_ref(b)); | |
| 2455 //cr->aux = DSTRING_LEN(s->aux_ds); | |
| 2456 //dstring_nappend(s->aux_ds, bam_aux(b), cr->aux_size); | |
| 2457 | |
| 2458 /* Read group, identified earlier */ | |
| 2459 if (rg) { | |
| 2460 SAM_RG *brg = sam_hdr_find_rg(fd->header, rg); | |
| 2461 cr->rg = brg ? brg->id : -1; | |
| 2462 } else if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
| 2463 SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN"); | |
| 2464 assert(brg); | |
| 2465 } else { | |
| 2466 cr->rg = -1; | |
| 2467 } | |
| 2468 cram_stats_add(c->stats[DS_RG], cr->rg); | |
| 2469 | |
| 2470 | |
| 2471 cr->ref_id = bam_ref(b); cram_stats_add(c->stats[DS_RI], cr->ref_id); | |
| 2472 cr->flags = bam_flag(b); | |
| 2473 if (bam_cigar_len(b) == 0) | |
| 2474 cr->flags |= BAM_FUNMAP; | |
| 2475 cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]); | |
| 2476 | |
| 2477 // Non reference based encoding means storing the bases verbatim as features, which in | |
| 2478 // turn means every base also has a quality already stored. | |
| 2479 if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3) | |
| 2480 cr->cram_flags = CRAM_FLAG_PRESERVE_QUAL_SCORES; | |
| 2481 else | |
| 2482 cr->cram_flags = 0; | |
| 2483 //cram_stats_add(c->stats[DS_CF], cr->cram_flags); | |
| 2484 | |
| 2485 c->num_bases += cr->len; | |
| 2486 cr->apos = bam_pos(b)+1; | |
| 2487 if (c->pos_sorted) { | |
| 2488 if (cr->apos < s->last_apos) { | |
| 2489 c->pos_sorted = 0; | |
| 2490 } else { | |
| 2491 cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos); | |
| 2492 s->last_apos = cr->apos; | |
| 2493 } | |
| 2494 } else { | |
| 2495 //cram_stats_add(c->stats[DS_AP], cr->apos); | |
| 2496 } | |
| 2497 c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos); | |
| 2498 | |
| 2499 cr->name = BLOCK_SIZE(s->name_blk); | |
| 2500 cr->name_len = bam_name_len(b); | |
| 2501 cram_stats_add(c->stats[DS_RN], cr->name_len); | |
| 2502 | |
| 2503 BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b)); | |
| 2504 | |
| 2505 | |
| 2506 /* | |
| 2507 * This seqs_ds is largely pointless and it could reuse the same memory | |
| 2508 * over and over. | |
| 2509 * s->base_blk is what we need for encoding. | |
| 2510 */ | |
| 2511 cr->seq = BLOCK_SIZE(s->seqs_blk); | |
| 2512 cr->qual = BLOCK_SIZE(s->qual_blk); | |
| 2513 BLOCK_GROW(s->seqs_blk, cr->len+1); | |
| 2514 BLOCK_GROW(s->qual_blk, cr->len); | |
| 2515 seq = cp = (char *)BLOCK_END(s->seqs_blk); | |
| 2516 | |
| 2517 *seq = 0; | |
| 2518 #ifdef ALLOW_UAC | |
| 2519 { | |
| 2520 // Convert seq 2 bases at a time for speed. | |
| 2521 static const uint16_t code2base[256] = { | |
| 2522 15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077, | |
| 2523 21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029, | |
| 2524 15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081, | |
| 2525 21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033, | |
| 2526 15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083, | |
| 2527 21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035, | |
| 2528 15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093, | |
| 2529 21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045, | |
| 2530 15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087, | |
| 2531 21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039, | |
| 2532 15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098, | |
| 2533 21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050, | |
| 2534 15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099, | |
| 2535 21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051, | |
| 2536 15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102, | |
| 2537 21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054, | |
| 2538 15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100, | |
| 2539 21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052, | |
| 2540 15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103, | |
| 2541 21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055, | |
| 2542 15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105, | |
| 2543 21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057, | |
| 2544 15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088, | |
| 2545 21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040, | |
| 2546 15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091, | |
| 2547 21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043, | |
| 2548 15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084, | |
| 2549 21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036, | |
| 2550 15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082, | |
| 2551 21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034, | |
| 2552 15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094, | |
| 2553 21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046 | |
| 2554 }; | |
| 2555 | |
| 2556 int l2 = cr->len / 2; | |
| 2557 unsigned char *from = (unsigned char *)bam_seq(b); | |
| 2558 uint16_t *cpi = (uint16_t *)cp; | |
| 2559 cp[0] = 0; | |
| 2560 for (i = 0; i < l2; i++) | |
| 2561 cpi[i] = le_int2(code2base[from[i]]); | |
| 2562 if ((i *= 2) < cr->len) | |
| 2563 cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; | |
| 2564 } | |
| 2565 #else | |
| 2566 for (i = 0; i < cr->len; i++) | |
| 2567 cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; | |
| 2568 #endif | |
| 2569 BLOCK_SIZE(s->seqs_blk) += cr->len; | |
| 2570 | |
| 2571 qual = cp = (char *)bam_qual(b); | |
| 2572 | |
| 2573 /* Copy and parse */ | |
| 2574 if (!(cr->flags & BAM_FUNMAP)) { | |
| 2575 int32_t *cig_to, *cig_from; | |
| 2576 int apos = cr->apos-1, spos = 0; | |
| 2577 | |
| 2578 cr->cigar = s->ncigar; | |
| 2579 cr->ncigar = bam_cigar_len(b); | |
| 2580 while (cr->cigar + cr->ncigar >= s->cigar_alloc) { | |
| 2581 s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024; | |
| 2582 s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar)); | |
| 2583 if (!s->cigar) | |
| 2584 return -1; | |
| 2585 } | |
| 2586 | |
| 2587 cig_to = (int32_t *)s->cigar; | |
| 2588 cig_from = (int32_t *)bam_cigar(b); | |
| 2589 | |
| 2590 cr->feature = 0; | |
| 2591 cr->nfeature = 0; | |
| 2592 for (i = 0; i < cr->ncigar; i++) { | |
| 2593 enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK; | |
| 2594 int cig_len = cig_from[i] >> BAM_CIGAR_SHIFT; | |
| 2595 cig_to[i] = cig_from[i]; | |
| 2596 | |
| 2597 /* Can also generate events from here for CRAM diffs */ | |
| 2598 | |
| 2599 switch (cig_op) { | |
| 2600 int l; | |
| 2601 | |
| 2602 // Don't trust = and X ops to be correct. | |
| 2603 case BAM_CMATCH: | |
| 2604 case BAM_CBASE_MATCH: | |
| 2605 case BAM_CBASE_MISMATCH: | |
| 2606 //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n", | |
| 2607 // cig_len, &ref[apos], cig_len, &seq[spos]); | |
| 2608 l = 0; | |
| 2609 if (!fd->no_ref && cr->len) { | |
| 2610 int end = cig_len+apos < c->ref_end | |
| 2611 ? cig_len : c->ref_end - apos; | |
| 2612 char *sp = &seq[spos]; | |
| 2613 char *rp = &ref[apos]; | |
| 2614 char *qp = &qual[spos]; | |
| 2615 for (l = 0; l < end; l++) { | |
| 2616 if (rp[l] != sp[l]) { | |
| 2617 if (!sp[l]) | |
| 2618 break; | |
| 2619 if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) { | |
| 2620 // Disabled for the time being as it doesn't | |
| 2621 // seem to gain us much. | |
| 2622 int ol=l; | |
| 2623 while (l<end && rp[l] != sp[l]) | |
| 2624 l++; | |
| 2625 if (l-ol > 1) { | |
| 2626 if (cram_add_bases(fd, c, s, cr, spos+ol, | |
| 2627 l-ol, &seq[spos+ol])) | |
| 2628 return -1; | |
| 2629 l--; | |
| 2630 } else { | |
| 2631 l = ol; | |
| 2632 if (cram_add_substitution(fd, c, s, cr, | |
| 2633 spos+l, sp[l], | |
| 2634 qp[l], rp[l])) | |
| 2635 return -1; | |
| 2636 } | |
| 2637 } else { | |
| 2638 if (cram_add_substitution(fd, c, s, cr, spos+l, | |
| 2639 sp[l], qp[l], rp[l])) | |
| 2640 return -1; | |
| 2641 } | |
| 2642 } | |
| 2643 } | |
| 2644 spos += l; | |
| 2645 apos += l; | |
| 2646 } | |
| 2647 | |
| 2648 if (l < cig_len && cr->len) { | |
| 2649 if (fd->no_ref) { | |
| 2650 if (CRAM_MAJOR_VERS(fd->version) == 3) { | |
| 2651 if (cram_add_bases(fd, c, s, cr, spos, | |
| 2652 cig_len-l, &seq[spos])) | |
| 2653 return -1; | |
| 2654 spos += cig_len-l; | |
| 2655 } else { | |
| 2656 for (; l < cig_len && seq[spos]; l++, spos++) { | |
| 2657 if (cram_add_base(fd, c, s, cr, spos, | |
| 2658 seq[spos], qual[spos])) | |
| 2659 return -1; | |
| 2660 } | |
| 2661 } | |
| 2662 } else { | |
| 2663 /* off end of sequence or non-ref based output */ | |
| 2664 for (; l < cig_len && seq[spos]; l++, spos++) { | |
| 2665 if (cram_add_base(fd, c, s, cr, spos, | |
| 2666 seq[spos], qual[spos])) | |
| 2667 return -1; | |
| 2668 } | |
| 2669 } | |
| 2670 apos += cig_len; | |
| 2671 } else if (!cr->len) { | |
| 2672 /* Seq "*" */ | |
| 2673 apos += cig_len; | |
| 2674 spos += cig_len; | |
| 2675 } | |
| 2676 break; | |
| 2677 | |
| 2678 case BAM_CDEL: | |
| 2679 if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos])) | |
| 2680 return -1; | |
| 2681 apos += cig_len; | |
| 2682 break; | |
| 2683 | |
| 2684 case BAM_CREF_SKIP: | |
| 2685 if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos])) | |
| 2686 return -1; | |
| 2687 apos += cig_len; | |
| 2688 break; | |
| 2689 | |
| 2690 case BAM_CINS: | |
| 2691 if (cram_add_insertion(c, s, cr, spos, cig_len, | |
| 2692 cr->len ? &seq[spos] : NULL)) | |
| 2693 return -1; | |
| 2694 if (fd->no_ref && cr->len) { | |
| 2695 for (l = 0; l < cig_len; l++, spos++) { | |
| 2696 cram_add_quality(fd, c, s, cr, spos, qual[spos]); | |
| 2697 } | |
| 2698 } else { | |
| 2699 spos += cig_len; | |
| 2700 } | |
| 2701 break; | |
| 2702 | |
| 2703 case BAM_CSOFT_CLIP: | |
| 2704 if (cram_add_softclip(c, s, cr, spos, cig_len, | |
| 2705 cr->len ? &seq[spos] : NULL, | |
| 2706 fd->version)) | |
| 2707 return -1; | |
| 2708 if (fd->no_ref && | |
| 2709 !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { | |
| 2710 if (cr->len) { | |
| 2711 for (l = 0; l < cig_len; l++, spos++) { | |
| 2712 cram_add_quality(fd, c, s, cr, spos, qual[spos]); | |
| 2713 } | |
| 2714 } else { | |
| 2715 for (l = 0; l < cig_len; l++, spos++) { | |
| 2716 cram_add_quality(fd, c, s, cr, spos, -1); | |
| 2717 } | |
| 2718 } | |
| 2719 } else { | |
| 2720 spos += cig_len; | |
| 2721 } | |
| 2722 break; | |
| 2723 | |
| 2724 case BAM_CHARD_CLIP: | |
| 2725 if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos])) | |
| 2726 return -1; | |
| 2727 break; | |
| 2728 | |
| 2729 case BAM_CPAD: | |
| 2730 if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos])) | |
| 2731 return -1; | |
| 2732 break; | |
| 2733 } | |
| 2734 } | |
| 2735 fake_qual = spos; | |
| 2736 cr->aend = MIN(apos, c->ref_end); | |
| 2737 cram_stats_add(c->stats[DS_FN], cr->nfeature); | |
| 2738 } else { | |
| 2739 // Unmapped | |
| 2740 cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES; | |
| 2741 cr->cigar = 0; | |
| 2742 cr->ncigar = 0; | |
| 2743 cr->nfeature = 0; | |
| 2744 cr->aend = cr->apos; | |
| 2745 for (i = 0; i < cr->len; i++) | |
| 2746 cram_stats_add(c->stats[DS_BA], seq[i]); | |
| 2747 } | |
| 2748 | |
| 2749 /* | |
| 2750 * Append to the qual block now. We do this here as | |
| 2751 * cram_add_substitution() can generate BA/QS events which need to | |
| 2752 * be in the qual block before we append the rest of the data. | |
| 2753 */ | |
| 2754 if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) { | |
| 2755 /* Special case of seq "*" */ | |
| 2756 if (cr->len == 0) { | |
| 2757 cram_stats_add(c->stats[DS_RL], cr->len = fake_qual); | |
| 2758 BLOCK_GROW(s->qual_blk, cr->len); | |
| 2759 cp = (char *)BLOCK_END(s->qual_blk); | |
| 2760 memset(cp, 255, cr->len); | |
| 2761 } else { | |
| 2762 BLOCK_GROW(s->qual_blk, cr->len); | |
| 2763 cp = (char *)BLOCK_END(s->qual_blk); | |
| 2764 char *from = (char *)&bam_qual(b)[0]; | |
| 2765 char *to = &cp[0]; | |
| 2766 memcpy(to, from, cr->len); | |
| 2767 //for (i = 0; i < cr->len; i++) cp[i] = from[i]; | |
| 2768 } | |
| 2769 BLOCK_SIZE(s->qual_blk) += cr->len; | |
| 2770 } else { | |
| 2771 if (cr->len == 0) { | |
| 2772 cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1; | |
| 2773 cram_stats_add(c->stats[DS_RL], cr->len); | |
| 2774 } | |
| 2775 } | |
| 2776 | |
| 2777 /* Now we know apos and aend both, update mate-pair information */ | |
| 2778 { | |
| 2779 int new; | |
| 2780 khint_t k; | |
| 2781 int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0; | |
| 2782 | |
| 2783 //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum, | |
| 2784 // cr->name_len, DSTRING_STR(s->name_ds)+cr->name); | |
| 2785 if (cr->flags & BAM_FPAIRED) { | |
| 2786 char *key = string_ndup(s->pair_keys, | |
| 2787 (char *)BLOCK_DATA(s->name_blk)+cr->name, | |
| 2788 cr->name_len); | |
| 2789 if (!key) | |
| 2790 return -1; | |
| 2791 | |
| 2792 k = kh_put(m_s2i, s->pair[sec], key, &new); | |
| 2793 if (-1 == new) | |
| 2794 return -1; | |
| 2795 else if (new > 0) | |
| 2796 kh_val(s->pair[sec], k) = rnum; | |
| 2797 } else { | |
| 2798 new = 1; | |
| 2799 } | |
| 2800 | |
| 2801 if (new == 0) { | |
| 2802 cram_record *p = &s->crecs[kh_val(s->pair[sec], k)]; | |
| 2803 int aleft, aright, sign; | |
| 2804 | |
| 2805 aleft = MIN(cr->apos, p->apos); | |
| 2806 aright = MAX(cr->aend, p->aend); | |
| 2807 if (cr->apos < p->apos) { | |
| 2808 sign = 1; | |
| 2809 } else if (cr->apos > p->apos) { | |
| 2810 sign = -1; | |
| 2811 } else if (cr->flags & BAM_FREAD1) { | |
| 2812 sign = 1; | |
| 2813 } else { | |
| 2814 sign = -1; | |
| 2815 } | |
| 2816 | |
| 2817 //fprintf(stderr, "paired %"PRId64"\n", kh_val(s->pair[sec], k)); | |
| 2818 | |
| 2819 // This vs p: tlen, matepos, flags | |
| 2820 if (bam_ins_size(b) != sign*(aright-aleft+1)) | |
| 2821 goto detached; | |
| 2822 | |
| 2823 if (MAX(bam_mate_pos(b)+1, 0) != p->apos) | |
| 2824 goto detached; | |
| 2825 | |
| 2826 if (((bam_flag(b) & BAM_FMUNMAP) != 0) != | |
| 2827 ((p->flags & BAM_FUNMAP) != 0)) | |
| 2828 goto detached; | |
| 2829 | |
| 2830 if (((bam_flag(b) & BAM_FMREVERSE) != 0) != | |
| 2831 ((p->flags & BAM_FREVERSE) != 0)) | |
| 2832 goto detached; | |
| 2833 | |
| 2834 | |
| 2835 // p vs this: tlen, matepos, flags | |
| 2836 if (p->tlen != -sign*(aright-aleft+1)) | |
| 2837 goto detached; | |
| 2838 | |
| 2839 if (p->mate_pos != cr->apos) | |
| 2840 goto detached; | |
| 2841 | |
| 2842 if (((p->flags & BAM_FMUNMAP) != 0) != | |
| 2843 ((p->mate_flags & CRAM_M_UNMAP) != 0)) | |
| 2844 goto detached; | |
| 2845 | |
| 2846 if (((p->flags & BAM_FMREVERSE) != 0) != | |
| 2847 ((p->mate_flags & CRAM_M_REVERSE) != 0)) | |
| 2848 goto detached; | |
| 2849 | |
| 2850 // Supplementary reads are just too ill defined | |
| 2851 if ((cr->flags & BAM_FSUPPLEMENTARY) || | |
| 2852 (p->flags & BAM_FSUPPLEMENTARY)) | |
| 2853 goto detached; | |
| 2854 | |
| 2855 /* | |
| 2856 * The fields below are unused when encoding this read as it is | |
| 2857 * no longer detached. In theory they may get referred to when | |
| 2858 * processing a 3rd or 4th read in this template?, so we set them | |
| 2859 * here just to be sure. | |
| 2860 * | |
| 2861 * They do not need cram_stats_add() calls those as they are | |
| 2862 * not emitted. | |
| 2863 */ | |
| 2864 cr->mate_pos = p->apos; | |
| 2865 cr->tlen = sign*(aright-aleft+1); | |
| 2866 cr->mate_flags = | |
| 2867 ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + | |
| 2868 ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE; | |
| 2869 | |
| 2870 // Decrement statistics aggregated earlier | |
| 2871 cram_stats_del(c->stats[DS_NP], p->mate_pos); | |
| 2872 cram_stats_del(c->stats[DS_MF], p->mate_flags); | |
| 2873 cram_stats_del(c->stats[DS_TS], p->tlen); | |
| 2874 cram_stats_del(c->stats[DS_NS], p->mate_ref_id); | |
| 2875 | |
| 2876 /* Similarly we could correct the p-> values too, but these will no | |
| 2877 * longer have any code that refers back to them as the new 'p' | |
| 2878 * for this template is our current 'cr'. | |
| 2879 */ | |
| 2880 //p->mate_pos = cr->apos; | |
| 2881 //p->mate_flags = | |
| 2882 // ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + | |
| 2883 // ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE; | |
| 2884 //p->tlen = p->apos - cr->aend; | |
| 2885 | |
| 2886 // Clear detached from cr flags | |
| 2887 cr->cram_flags &= ~CRAM_FLAG_DETACHED; | |
| 2888 cram_stats_add(c->stats[DS_CF], cr->cram_flags); | |
| 2889 | |
| 2890 // Clear detached from p flags and set downstream | |
| 2891 cram_stats_del(c->stats[DS_CF], p->cram_flags); | |
| 2892 p->cram_flags &= ~CRAM_FLAG_DETACHED; | |
| 2893 p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM; | |
| 2894 cram_stats_add(c->stats[DS_CF], p->cram_flags); | |
| 2895 | |
| 2896 p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1); | |
| 2897 cram_stats_add(c->stats[DS_NF], p->mate_line); | |
| 2898 | |
| 2899 kh_val(s->pair[sec], k) = rnum; | |
| 2900 } else { | |
| 2901 detached: | |
| 2902 //fprintf(stderr, "unpaired\n"); | |
| 2903 | |
| 2904 /* Derive mate flags from this flag */ | |
| 2905 cr->mate_flags = 0; | |
| 2906 if (bam_flag(b) & BAM_FMUNMAP) | |
| 2907 cr->mate_flags |= CRAM_M_UNMAP; | |
| 2908 if (bam_flag(b) & BAM_FMREVERSE) | |
| 2909 cr->mate_flags |= CRAM_M_REVERSE; | |
| 2910 | |
| 2911 cram_stats_add(c->stats[DS_MF], cr->mate_flags); | |
| 2912 | |
| 2913 cr->mate_pos = MAX(bam_mate_pos(b)+1, 0); | |
| 2914 cram_stats_add(c->stats[DS_NP], cr->mate_pos); | |
| 2915 | |
| 2916 cr->tlen = bam_ins_size(b); | |
| 2917 cram_stats_add(c->stats[DS_TS], cr->tlen); | |
| 2918 | |
| 2919 cr->cram_flags |= CRAM_FLAG_DETACHED; | |
| 2920 cram_stats_add(c->stats[DS_CF], cr->cram_flags); | |
| 2921 cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)); | |
| 2922 } | |
| 2923 } | |
| 2924 | |
| 2925 cr->mqual = bam_map_qual(b); | |
| 2926 cram_stats_add(c->stats[DS_MQ], cr->mqual); | |
| 2927 | |
| 2928 cr->mate_ref_id = bam_mate_ref(b); | |
| 2929 | |
| 2930 if (!(bam_flag(b) & BAM_FUNMAP)) { | |
| 2931 if (c->first_base > cr->apos) | |
| 2932 c->first_base = cr->apos; | |
| 2933 | |
| 2934 if (c->last_base < cr->aend) | |
| 2935 c->last_base = cr->aend; | |
| 2936 } | |
| 2937 | |
| 2938 return 0; | |
| 2939 } | |
| 2940 | |
| 2941 /* | |
| 2942 * Write iterator: put BAM format sequences into a CRAM file. | |
| 2943 * We buffer up a containers worth of data at a time. | |
| 2944 * | |
| 2945 * Returns 0 on success | |
| 2946 * -1 on failure | |
| 2947 */ | |
| 2948 int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { | |
| 2949 cram_container *c; | |
| 2950 | |
| 2951 if (!fd->ctr) { | |
| 2952 fd->ctr = cram_new_container(fd->seqs_per_slice, | |
| 2953 fd->slices_per_container); | |
| 2954 if (!fd->ctr) | |
| 2955 return -1; | |
| 2956 fd->ctr->record_counter = fd->record_counter; | |
| 2957 } | |
| 2958 c = fd->ctr; | |
| 2959 | |
| 2960 if (!c->slice || c->curr_rec == c->max_rec || | |
| 2961 (bam_ref(b) != c->curr_ref && c->curr_ref >= -1)) { | |
| 2962 int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1; | |
| 2963 int curr_ref = c->slice ? c->curr_ref : bam_ref(b); | |
| 2964 | |
| 2965 | |
| 2966 /* | |
| 2967 * Start packing slices when we routinely have under 1/4tr full. | |
| 2968 * | |
| 2969 * This option isn't available if we choose to embed references | |
| 2970 * since we can only have one per slice. | |
| 2971 */ | |
| 2972 if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 && | |
| 2973 fd->last_slice && fd->last_slice < c->max_rec/4+10 && | |
| 2974 !fd->embed_ref) { | |
| 2975 if (fd->verbose && !c->multi_seq) | |
| 2976 fprintf(stderr, "Multi-ref enabled for this container\n"); | |
| 2977 multi_seq = 1; | |
| 2978 } | |
| 2979 | |
| 2980 slice_rec = c->slice_rec; | |
| 2981 curr_rec = c->curr_rec; | |
| 2982 | |
| 2983 if (CRAM_MAJOR_VERS(fd->version) == 1 || | |
| 2984 c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice) { | |
| 2985 if (NULL == (c = cram_next_container(fd, b))) { | |
| 2986 if (fd->ctr) { | |
| 2987 // prevent cram_close attempting to flush | |
| 2988 cram_free_container(fd->ctr); | |
| 2989 fd->ctr = NULL; | |
| 2990 } | |
| 2991 return -1; | |
| 2992 } | |
| 2993 } | |
| 2994 | |
| 2995 /* | |
| 2996 * Due to our processing order, some things we've already done we | |
| 2997 * cannot easily undo. So when we first notice we should be packing | |
| 2998 * multiple sequences per container we emit the small partial | |
| 2999 * container as-is and then start a fresh one in a different mode. | |
| 3000 */ | |
| 3001 if (multi_seq) { | |
| 3002 fd->multi_seq = 1; | |
| 3003 c->multi_seq = 1; | |
| 3004 c->pos_sorted = 0; // required atm for multi_seq slices | |
| 3005 | |
| 3006 if (!c->refs_used) { | |
| 3007 pthread_mutex_lock(&fd->ref_lock); | |
| 3008 c->refs_used = calloc(fd->refs->nref, sizeof(int)); | |
| 3009 pthread_mutex_unlock(&fd->ref_lock); | |
| 3010 if (!c->refs_used) | |
| 3011 return -1; | |
| 3012 } | |
| 3013 } | |
| 3014 | |
| 3015 fd->last_slice = curr_rec - slice_rec; | |
| 3016 c->slice_rec = c->curr_rec; | |
| 3017 | |
| 3018 // Have we seen this reference before? | |
| 3019 if (bam_ref(b) >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref && | |
| 3020 !fd->unsorted && multi_seq) { | |
| 3021 | |
| 3022 if (!c->refs_used) { | |
| 3023 pthread_mutex_lock(&fd->ref_lock); | |
| 3024 c->refs_used = calloc(fd->refs->nref, sizeof(int)); | |
| 3025 pthread_mutex_unlock(&fd->ref_lock); | |
| 3026 if (!c->refs_used) | |
| 3027 return -1; | |
| 3028 } else if (c->refs_used && c->refs_used[bam_ref(b)]) { | |
| 3029 fprintf(stderr, "Unsorted mode enabled\n"); | |
| 3030 pthread_mutex_lock(&fd->ref_lock); | |
| 3031 fd->unsorted = 1; | |
| 3032 pthread_mutex_unlock(&fd->ref_lock); | |
| 3033 fd->multi_seq = 1; | |
| 3034 } | |
| 3035 } | |
| 3036 | |
| 3037 c->curr_ref = bam_ref(b); | |
| 3038 if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++; | |
| 3039 } | |
| 3040 | |
| 3041 if (!c->bams) { | |
| 3042 /* First time through, allocate a set of bam pointers */ | |
| 3043 pthread_mutex_lock(&fd->bam_list_lock); | |
| 3044 if (fd->bl) { | |
| 3045 spare_bams *spare = fd->bl; | |
| 3046 c->bams = spare->bams; | |
| 3047 fd->bl = spare->next; | |
| 3048 free(spare); | |
| 3049 } else { | |
| 3050 c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *)); | |
| 3051 if (!c->bams) | |
| 3052 return -1; | |
| 3053 } | |
| 3054 pthread_mutex_unlock(&fd->bam_list_lock); | |
| 3055 } | |
| 3056 | |
| 3057 /* Copy or alloc+copy the bam record, for later encoding */ | |
| 3058 if (c->bams[c->curr_c_rec]) | |
| 3059 bam_copy1(c->bams[c->curr_c_rec], b); | |
| 3060 else | |
| 3061 c->bams[c->curr_c_rec] = bam_dup(b); | |
| 3062 | |
| 3063 c->curr_rec++; | |
| 3064 c->curr_c_rec++; | |
| 3065 fd->record_counter++; | |
| 3066 | |
| 3067 return 0; | |
| 3068 } |
