Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/sam.c @ 0:dfa3745e5fd8
Uploaded
| author | youngkim |
|---|---|
| date | Thu, 24 Mar 2016 17:12:52 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfa3745e5fd8 |
|---|---|
| 1 /* sam.c -- SAM and BAM file I/O and manipulation. | |
| 2 | |
| 3 Copyright (C) 2008-2010, 2012-2014 Genome Research Ltd. | |
| 4 Copyright (C) 2010, 2012, 2013 Broad Institute. | |
| 5 | |
| 6 Author: Heng Li <lh3@sanger.ac.uk> | |
| 7 | |
| 8 Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 9 of this software and associated documentation files (the "Software"), to deal | |
| 10 in the Software without restriction, including without limitation the rights | |
| 11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 12 copies of the Software, and to permit persons to whom the Software is | |
| 13 furnished to do so, subject to the following conditions: | |
| 14 | |
| 15 The above copyright notice and this permission notice shall be included in | |
| 16 all copies or substantial portions of the Software. | |
| 17 | |
| 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
| 21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
| 23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
| 24 DEALINGS IN THE SOFTWARE. */ | |
| 25 | |
| 26 #include <stdio.h> | |
| 27 #include <stdlib.h> | |
| 28 #include <string.h> | |
| 29 #include <errno.h> | |
| 30 #include <ctype.h> | |
| 31 #include <zlib.h> | |
| 32 #include "htslib/sam.h" | |
| 33 #include "htslib/bgzf.h" | |
| 34 #include "cram/cram.h" | |
| 35 #include "htslib/hfile.h" | |
| 36 | |
| 37 #include "htslib/khash.h" | |
| 38 KHASH_DECLARE(s2i, kh_cstr_t, int64_t) | |
| 39 | |
| 40 typedef khash_t(s2i) sdict_t; | |
| 41 | |
| 42 /********************** | |
| 43 *** BAM header I/O *** | |
| 44 **********************/ | |
| 45 | |
| 46 bam_hdr_t *bam_hdr_init() | |
| 47 { | |
| 48 return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t)); | |
| 49 } | |
| 50 | |
| 51 void bam_hdr_destroy(bam_hdr_t *h) | |
| 52 { | |
| 53 int32_t i; | |
| 54 if (h == NULL) return; | |
| 55 if (h->target_name) { | |
| 56 for (i = 0; i < h->n_targets; ++i) | |
| 57 free(h->target_name[i]); | |
| 58 free(h->target_name); | |
| 59 free(h->target_len); | |
| 60 } | |
| 61 free(h->text); free(h->cigar_tab); | |
| 62 if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict); | |
| 63 free(h); | |
| 64 } | |
| 65 | |
| 66 bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0) | |
| 67 { | |
| 68 if (h0 == NULL) return NULL; | |
| 69 bam_hdr_t *h; | |
| 70 if ((h = bam_hdr_init()) == NULL) return NULL; | |
| 71 // copy the simple data | |
| 72 h->n_targets = h0->n_targets; | |
| 73 h->ignore_sam_err = h0->ignore_sam_err; | |
| 74 h->l_text = h0->l_text; | |
| 75 // Then the pointery stuff | |
| 76 h->cigar_tab = NULL; | |
| 77 h->sdict = NULL; | |
| 78 h->text = (char*)calloc(h->l_text + 1, 1); | |
| 79 memcpy(h->text, h0->text, h->l_text); | |
| 80 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t)); | |
| 81 h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); | |
| 82 int i; | |
| 83 for (i = 0; i < h->n_targets; ++i) { | |
| 84 h->target_len[i] = h0->target_len[i]; | |
| 85 h->target_name[i] = strdup(h0->target_name[i]); | |
| 86 } | |
| 87 return h; | |
| 88 } | |
| 89 | |
| 90 | |
| 91 static bam_hdr_t *hdr_from_dict(sdict_t *d) | |
| 92 { | |
| 93 bam_hdr_t *h; | |
| 94 khint_t k; | |
| 95 h = bam_hdr_init(); | |
| 96 h->sdict = d; | |
| 97 h->n_targets = kh_size(d); | |
| 98 h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets); | |
| 99 h->target_name = (char**)malloc(sizeof(char*) * h->n_targets); | |
| 100 for (k = kh_begin(d); k != kh_end(d); ++k) { | |
| 101 if (!kh_exist(d, k)) continue; | |
| 102 h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k); | |
| 103 h->target_len[kh_val(d, k)>>32] = kh_val(d, k)<<32>>32; | |
| 104 kh_val(d, k) >>= 32; | |
| 105 } | |
| 106 return h; | |
| 107 } | |
| 108 | |
| 109 bam_hdr_t *bam_hdr_read(BGZF *fp) | |
| 110 { | |
| 111 bam_hdr_t *h; | |
| 112 char buf[4]; | |
| 113 int magic_len, has_EOF; | |
| 114 int32_t i = 1, name_len; | |
| 115 // check EOF | |
| 116 has_EOF = bgzf_check_EOF(fp); | |
| 117 if (has_EOF < 0) { | |
| 118 perror("[W::sam_hdr_read] bgzf_check_EOF"); | |
| 119 } else if (has_EOF == 0 && hts_verbose >= 2) | |
| 120 fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__); | |
| 121 // read "BAM1" | |
| 122 magic_len = bgzf_read(fp, buf, 4); | |
| 123 if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) { | |
| 124 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] invalid BAM binary header\n", __func__); | |
| 125 return 0; | |
| 126 } | |
| 127 h = bam_hdr_init(); | |
| 128 // read plain text and the number of reference sequences | |
| 129 bgzf_read(fp, &h->l_text, 4); | |
| 130 if (fp->is_be) ed_swap_4p(&h->l_text); | |
| 131 h->text = (char*)malloc(h->l_text + 1); | |
| 132 h->text[h->l_text] = 0; // make sure it is NULL terminated | |
| 133 bgzf_read(fp, h->text, h->l_text); | |
| 134 bgzf_read(fp, &h->n_targets, 4); | |
| 135 if (fp->is_be) ed_swap_4p(&h->n_targets); | |
| 136 // read reference sequence names and lengths | |
| 137 h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); | |
| 138 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t)); | |
| 139 for (i = 0; i != h->n_targets; ++i) { | |
| 140 bgzf_read(fp, &name_len, 4); | |
| 141 if (fp->is_be) ed_swap_4p(&name_len); | |
| 142 h->target_name[i] = (char*)calloc(name_len, 1); | |
| 143 bgzf_read(fp, h->target_name[i], name_len); | |
| 144 bgzf_read(fp, &h->target_len[i], 4); | |
| 145 if (fp->is_be) ed_swap_4p(&h->target_len[i]); | |
| 146 } | |
| 147 return h; | |
| 148 } | |
| 149 | |
| 150 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) | |
| 151 { | |
| 152 char buf[4]; | |
| 153 int32_t i, name_len, x; | |
| 154 // write "BAM1" | |
| 155 strncpy(buf, "BAM\1", 4); | |
| 156 bgzf_write(fp, buf, 4); | |
| 157 // write plain text and the number of reference sequences | |
| 158 if (fp->is_be) { | |
| 159 x = ed_swap_4(h->l_text); | |
| 160 bgzf_write(fp, &x, 4); | |
| 161 if (h->l_text) bgzf_write(fp, h->text, h->l_text); | |
| 162 x = ed_swap_4(h->n_targets); | |
| 163 bgzf_write(fp, &x, 4); | |
| 164 } else { | |
| 165 bgzf_write(fp, &h->l_text, 4); | |
| 166 if (h->l_text) bgzf_write(fp, h->text, h->l_text); | |
| 167 bgzf_write(fp, &h->n_targets, 4); | |
| 168 } | |
| 169 // write sequence names and lengths | |
| 170 for (i = 0; i != h->n_targets; ++i) { | |
| 171 char *p = h->target_name[i]; | |
| 172 name_len = strlen(p) + 1; | |
| 173 if (fp->is_be) { | |
| 174 x = ed_swap_4(name_len); | |
| 175 bgzf_write(fp, &x, 4); | |
| 176 } else bgzf_write(fp, &name_len, 4); | |
| 177 bgzf_write(fp, p, name_len); | |
| 178 if (fp->is_be) { | |
| 179 x = ed_swap_4(h->target_len[i]); | |
| 180 bgzf_write(fp, &x, 4); | |
| 181 } else bgzf_write(fp, &h->target_len[i], 4); | |
| 182 } | |
| 183 bgzf_flush(fp); | |
| 184 return 0; | |
| 185 } | |
| 186 | |
| 187 int bam_name2id(bam_hdr_t *h, const char *ref) | |
| 188 { | |
| 189 sdict_t *d = (sdict_t*)h->sdict; | |
| 190 khint_t k; | |
| 191 if (h->sdict == 0) { | |
| 192 int i, absent; | |
| 193 d = kh_init(s2i); | |
| 194 for (i = 0; i < h->n_targets; ++i) { | |
| 195 k = kh_put(s2i, d, h->target_name[i], &absent); | |
| 196 kh_val(d, k) = i; | |
| 197 } | |
| 198 h->sdict = d; | |
| 199 } | |
| 200 k = kh_get(s2i, d, ref); | |
| 201 return k == kh_end(d)? -1 : kh_val(d, k); | |
| 202 } | |
| 203 | |
| 204 /************************* | |
| 205 *** BAM alignment I/O *** | |
| 206 *************************/ | |
| 207 | |
| 208 bam1_t *bam_init1() | |
| 209 { | |
| 210 return (bam1_t*)calloc(1, sizeof(bam1_t)); | |
| 211 } | |
| 212 | |
| 213 void bam_destroy1(bam1_t *b) | |
| 214 { | |
| 215 if (b == 0) return; | |
| 216 free(b->data); free(b); | |
| 217 } | |
| 218 | |
| 219 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) | |
| 220 { | |
| 221 uint8_t *data = bdst->data; | |
| 222 int m_data = bdst->m_data; // backup data and m_data | |
| 223 if (m_data < bsrc->l_data) { // double the capacity | |
| 224 m_data = bsrc->l_data; kroundup32(m_data); | |
| 225 data = (uint8_t*)realloc(data, m_data); | |
| 226 } | |
| 227 memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data | |
| 228 *bdst = *bsrc; // copy the rest | |
| 229 // restore the backup | |
| 230 bdst->m_data = m_data; | |
| 231 bdst->data = data; | |
| 232 return bdst; | |
| 233 } | |
| 234 | |
| 235 bam1_t *bam_dup1(const bam1_t *bsrc) | |
| 236 { | |
| 237 if (bsrc == NULL) return NULL; | |
| 238 bam1_t *bdst = bam_init1(); | |
| 239 if (bdst == NULL) return NULL; | |
| 240 return bam_copy1(bdst, bsrc); | |
| 241 } | |
| 242 | |
| 243 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) | |
| 244 { | |
| 245 int k, l; | |
| 246 for (k = l = 0; k < n_cigar; ++k) | |
| 247 if (bam_cigar_type(bam_cigar_op(cigar[k]))&1) | |
| 248 l += bam_cigar_oplen(cigar[k]); | |
| 249 return l; | |
| 250 } | |
| 251 | |
| 252 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) | |
| 253 { | |
| 254 int k, l; | |
| 255 for (k = l = 0; k < n_cigar; ++k) | |
| 256 if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) | |
| 257 l += bam_cigar_oplen(cigar[k]); | |
| 258 return l; | |
| 259 } | |
| 260 | |
| 261 int32_t bam_endpos(const bam1_t *b) | |
| 262 { | |
| 263 if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0) | |
| 264 return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); | |
| 265 else | |
| 266 return b->core.pos + 1; | |
| 267 } | |
| 268 | |
| 269 static inline int aux_type2size(uint8_t type) | |
| 270 { | |
| 271 switch (type) { | |
| 272 case 'A': case 'c': case 'C': | |
| 273 return 1; | |
| 274 case 's': case 'S': | |
| 275 return 2; | |
| 276 case 'i': case 'I': case 'f': | |
| 277 return 4; | |
| 278 case 'd': | |
| 279 return 8; | |
| 280 case 'Z': case 'H': case 'B': | |
| 281 return type; | |
| 282 default: | |
| 283 return 0; | |
| 284 } | |
| 285 } | |
| 286 | |
| 287 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host) | |
| 288 { | |
| 289 uint8_t *s; | |
| 290 uint32_t *cigar = (uint32_t*)(data + c->l_qname); | |
| 291 uint32_t i, n; | |
| 292 s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; | |
| 293 for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]); | |
| 294 while (s < data + l_data) { | |
| 295 int size; | |
| 296 s += 2; // skip key | |
| 297 size = aux_type2size(*s); ++s; // skip type | |
| 298 switch (size) { | |
| 299 case 1: ++s; break; | |
| 300 case 2: ed_swap_2p(s); s += 2; break; | |
| 301 case 4: ed_swap_4p(s); s += 4; break; | |
| 302 case 8: ed_swap_8p(s); s += 8; break; | |
| 303 case 'Z': | |
| 304 case 'H': | |
| 305 while (*s) ++s; | |
| 306 ++s; | |
| 307 break; | |
| 308 case 'B': | |
| 309 size = aux_type2size(*s); ++s; | |
| 310 if (is_host) memcpy(&n, s, 4), ed_swap_4p(s); | |
| 311 else ed_swap_4p(s), memcpy(&n, s, 4); | |
| 312 s += 4; | |
| 313 switch (size) { | |
| 314 case 1: s += n; break; | |
| 315 case 2: for (i = 0; i < n; ++i, s += 2) ed_swap_2p(s); break; | |
| 316 case 4: for (i = 0; i < n; ++i, s += 4) ed_swap_4p(s); break; | |
| 317 case 8: for (i = 0; i < n; ++i, s += 8) ed_swap_8p(s); break; | |
| 318 } | |
| 319 break; | |
| 320 } | |
| 321 } | |
| 322 } | |
| 323 | |
| 324 int bam_read1(BGZF *fp, bam1_t *b) | |
| 325 { | |
| 326 bam1_core_t *c = &b->core; | |
| 327 int32_t block_len, ret, i; | |
| 328 uint32_t x[8]; | |
| 329 if ((ret = bgzf_read(fp, &block_len, 4)) != 4) { | |
| 330 if (ret == 0) return -1; // normal end-of-file | |
| 331 else return -2; // truncated | |
| 332 } | |
| 333 if (bgzf_read(fp, x, 32) != 32) return -3; | |
| 334 if (fp->is_be) { | |
| 335 ed_swap_4p(&block_len); | |
| 336 for (i = 0; i < 8; ++i) ed_swap_4p(x + i); | |
| 337 } | |
| 338 c->tid = x[0]; c->pos = x[1]; | |
| 339 c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; | |
| 340 c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; | |
| 341 c->l_qseq = x[4]; | |
| 342 c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; | |
| 343 b->l_data = block_len - 32; | |
| 344 if (b->l_data < 0 || c->l_qseq < 0) return -4; | |
| 345 if ((char *)bam_get_aux(b) - (char *)b->data > b->l_data) | |
| 346 return -4; | |
| 347 if (b->m_data < b->l_data) { | |
| 348 b->m_data = b->l_data; | |
| 349 kroundup32(b->m_data); | |
| 350 b->data = (uint8_t*)realloc(b->data, b->m_data); | |
| 351 if (!b->data) | |
| 352 return -4; | |
| 353 } | |
| 354 if (bgzf_read(fp, b->data, b->l_data) != b->l_data) return -4; | |
| 355 //b->l_aux = b->l_data - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; | |
| 356 if (fp->is_be) swap_data(c, b->l_data, b->data, 0); | |
| 357 return 4 + block_len; | |
| 358 } | |
| 359 | |
| 360 int bam_write1(BGZF *fp, const bam1_t *b) | |
| 361 { | |
| 362 const bam1_core_t *c = &b->core; | |
| 363 uint32_t x[8], block_len = b->l_data + 32, y; | |
| 364 int i, ok; | |
| 365 x[0] = c->tid; | |
| 366 x[1] = c->pos; | |
| 367 x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname; | |
| 368 x[3] = (uint32_t)c->flag<<16 | c->n_cigar; | |
| 369 x[4] = c->l_qseq; | |
| 370 x[5] = c->mtid; | |
| 371 x[6] = c->mpos; | |
| 372 x[7] = c->isize; | |
| 373 ok = (bgzf_flush_try(fp, 4 + block_len) >= 0); | |
| 374 if (fp->is_be) { | |
| 375 for (i = 0; i < 8; ++i) ed_swap_4p(x + i); | |
| 376 y = block_len; | |
| 377 if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); | |
| 378 swap_data(c, b->l_data, b->data, 1); | |
| 379 } else { | |
| 380 if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0); | |
| 381 } | |
| 382 if (ok) ok = (bgzf_write(fp, x, 32) >= 0); | |
| 383 if (ok) ok = (bgzf_write(fp, b->data, b->l_data) >= 0); | |
| 384 if (fp->is_be) swap_data(c, b->l_data, b->data, 0); | |
| 385 return ok? 4 + block_len : -1; | |
| 386 } | |
| 387 | |
| 388 /******************** | |
| 389 *** BAM indexing *** | |
| 390 ********************/ | |
| 391 | |
| 392 static hts_idx_t *bam_index(BGZF *fp, int min_shift) | |
| 393 { | |
| 394 int n_lvls, i, fmt; | |
| 395 bam1_t *b; | |
| 396 hts_idx_t *idx; | |
| 397 bam_hdr_t *h; | |
| 398 h = bam_hdr_read(fp); | |
| 399 if (min_shift > 0) { | |
| 400 int64_t max_len = 0, s; | |
| 401 for (i = 0; i < h->n_targets; ++i) | |
| 402 if (max_len < h->target_len[i]) max_len = h->target_len[i]; | |
| 403 max_len += 256; | |
| 404 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3); | |
| 405 fmt = HTS_FMT_CSI; | |
| 406 } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; | |
| 407 idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls); | |
| 408 bam_hdr_destroy(h); | |
| 409 b = bam_init1(); | |
| 410 while (bam_read1(fp, b) >= 0) { | |
| 411 int l, ret; | |
| 412 l = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); | |
| 413 if (l == 0) l = 1; // no zero-length records | |
| 414 ret = hts_idx_push(idx, b->core.tid, b->core.pos, b->core.pos + l, bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP)); | |
| 415 if (ret < 0) | |
| 416 { | |
| 417 // unsorted | |
| 418 bam_destroy1(b); | |
| 419 hts_idx_destroy(idx); | |
| 420 return NULL; | |
| 421 } | |
| 422 } | |
| 423 hts_idx_finish(idx, bgzf_tell(fp)); | |
| 424 bam_destroy1(b); | |
| 425 return idx; | |
| 426 } | |
| 427 | |
| 428 int bam_index_build(const char *fn, int min_shift) | |
| 429 { | |
| 430 hts_idx_t *idx; | |
| 431 htsFile *fp; | |
| 432 int ret = 0; | |
| 433 | |
| 434 if ((fp = hts_open(fn, "r")) == 0) return -1; | |
| 435 switch (fp->format.format) { | |
| 436 case cram: | |
| 437 ret = cram_index_build(fp->fp.cram, fn); | |
| 438 break; | |
| 439 | |
| 440 case bam: | |
| 441 idx = bam_index(fp->fp.bgzf, min_shift); | |
| 442 if (idx) { | |
| 443 hts_idx_save(idx, fn, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI); | |
| 444 hts_idx_destroy(idx); | |
| 445 } | |
| 446 else ret = -1; | |
| 447 break; | |
| 448 | |
| 449 default: | |
| 450 ret = -1; | |
| 451 break; | |
| 452 } | |
| 453 hts_close(fp); | |
| 454 | |
| 455 return ret; | |
| 456 } | |
| 457 | |
| 458 static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end) | |
| 459 { | |
| 460 bam1_t *b = bv; | |
| 461 int ret; | |
| 462 if ((ret = bam_read1(fp, b)) >= 0) { | |
| 463 *tid = b->core.tid; *beg = b->core.pos; | |
| 464 *end = b->core.pos + (b->core.n_cigar? bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)) : 1); | |
| 465 } | |
| 466 return ret; | |
| 467 } | |
| 468 | |
| 469 // This is used only with read_rest=1 iterators, so need not set tid/beg/end. | |
| 470 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) | |
| 471 { | |
| 472 htsFile *fp = fpv; | |
| 473 bam1_t *b = bv; | |
| 474 return cram_get_bam_seq(fp->fp.cram, &b); | |
| 475 } | |
| 476 | |
| 477 // This is used only with read_rest=1 iterators, so need not set tid/beg/end. | |
| 478 static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end) | |
| 479 { | |
| 480 htsFile *fp = fpv; | |
| 481 bam1_t *b = bv; | |
| 482 switch (fp->format.format) { | |
| 483 case bam: return bam_read1(bgzfp, b); | |
| 484 case cram: return cram_get_bam_seq(fp->fp.cram, &b); | |
| 485 default: | |
| 486 // TODO Need headers available to implement this for SAM files | |
| 487 fprintf(stderr, "[sam_bam_cram_readrec] Not implemented for SAM files -- Exiting\n"); | |
| 488 abort(); | |
| 489 } | |
| 490 } | |
| 491 | |
| 492 // The CRAM implementation stores the loaded index within the cram_fd rather | |
| 493 // than separately as is done elsewhere in htslib. So if p is a pointer to | |
| 494 // an hts_idx_t with p->fmt == HTS_FMT_CRAI, then it actually points to an | |
| 495 // hts_cram_idx_t and should be cast accordingly. | |
| 496 typedef struct hts_cram_idx_t { | |
| 497 int fmt; | |
| 498 cram_fd *cram; | |
| 499 } hts_cram_idx_t; | |
| 500 | |
| 501 hts_idx_t *sam_index_load(samFile *fp, const char *fn) | |
| 502 { | |
| 503 switch (fp->format.format) { | |
| 504 case bam: | |
| 505 return bam_index_load(fn); | |
| 506 | |
| 507 case cram: { | |
| 508 if (cram_index_load(fp->fp.cram, fn) < 0) return NULL; | |
| 509 // Cons up a fake "index" just pointing at the associated cram_fd: | |
| 510 hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t)); | |
| 511 if (idx == NULL) return NULL; | |
| 512 idx->fmt = HTS_FMT_CRAI; | |
| 513 idx->cram = fp->fp.cram; | |
| 514 return (hts_idx_t *) idx; | |
| 515 } | |
| 516 | |
| 517 default: | |
| 518 return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t | |
| 519 } | |
| 520 } | |
| 521 | |
| 522 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) | |
| 523 { | |
| 524 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; | |
| 525 hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); | |
| 526 if (iter == NULL) return NULL; | |
| 527 | |
| 528 // Cons up a dummy iterator for which hts_itr_next() will simply invoke | |
| 529 // the readrec function: | |
| 530 iter->read_rest = 1; | |
| 531 iter->off = NULL; | |
| 532 iter->bins.a = NULL; | |
| 533 iter->readrec = readrec; | |
| 534 | |
| 535 if (tid >= 0) { | |
| 536 cram_range r = { tid, beg+1, end }; | |
| 537 if (cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r) != 0) { free(iter); return NULL; } | |
| 538 iter->curr_off = 0; | |
| 539 // The following fields are not required by hts_itr_next(), but are | |
| 540 // filled in in case user code wants to look at them. | |
| 541 iter->tid = tid; | |
| 542 iter->beg = beg; | |
| 543 iter->end = end; | |
| 544 } | |
| 545 else switch (tid) { | |
| 546 case HTS_IDX_REST: | |
| 547 iter->curr_off = 0; | |
| 548 break; | |
| 549 case HTS_IDX_NONE: | |
| 550 iter->curr_off = 0; | |
| 551 iter->finished = 1; | |
| 552 break; | |
| 553 default: | |
| 554 fprintf(stderr, "[cram_itr_query] tid=%d not implemented for CRAM files -- Exiting\n", tid); | |
| 555 abort(); | |
| 556 break; | |
| 557 } | |
| 558 | |
| 559 return iter; | |
| 560 } | |
| 561 | |
| 562 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) | |
| 563 { | |
| 564 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; | |
| 565 if (idx == NULL) | |
| 566 return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec); | |
| 567 else if (cidx->fmt == HTS_FMT_CRAI) | |
| 568 return cram_itr_query(idx, tid, beg, end, cram_readrec); | |
| 569 else | |
| 570 return hts_itr_query(idx, tid, beg, end, bam_readrec); | |
| 571 } | |
| 572 | |
| 573 static int cram_name2id(void *fdv, const char *ref) | |
| 574 { | |
| 575 cram_fd *fd = (cram_fd *) fdv; | |
| 576 return sam_hdr_name2ref(fd->header, ref); | |
| 577 } | |
| 578 | |
| 579 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region) | |
| 580 { | |
| 581 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; | |
| 582 if (cidx->fmt == HTS_FMT_CRAI) | |
| 583 return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec); | |
| 584 else | |
| 585 return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec); | |
| 586 } | |
| 587 | |
| 588 /********************** | |
| 589 *** SAM header I/O *** | |
| 590 **********************/ | |
| 591 | |
| 592 #include "htslib/kseq.h" | |
| 593 #include "htslib/kstring.h" | |
| 594 | |
| 595 bam_hdr_t *sam_hdr_parse(int l_text, const char *text) | |
| 596 { | |
| 597 const char *q, *r, *p; | |
| 598 khash_t(s2i) *d; | |
| 599 d = kh_init(s2i); | |
| 600 for (p = text; *p; ++p) { | |
| 601 if (strncmp(p, "@SQ", 3) == 0) { | |
| 602 char *sn = 0; | |
| 603 int ln = -1; | |
| 604 for (q = p + 4;; ++q) { | |
| 605 if (strncmp(q, "SN:", 3) == 0) { | |
| 606 q += 3; | |
| 607 for (r = q; *r != '\t' && *r != '\n'; ++r); | |
| 608 sn = (char*)calloc(r - q + 1, 1); | |
| 609 strncpy(sn, q, r - q); | |
| 610 q = r; | |
| 611 } else if (strncmp(q, "LN:", 3) == 0) | |
| 612 ln = strtol(q + 3, (char**)&q, 10); | |
| 613 while (*q != '\t' && *q != '\n') ++q; | |
| 614 if (*q == '\n') break; | |
| 615 } | |
| 616 p = q; | |
| 617 if (sn && ln >= 0) { | |
| 618 khint_t k; | |
| 619 int absent; | |
| 620 k = kh_put(s2i, d, sn, &absent); | |
| 621 if (!absent) { | |
| 622 if (hts_verbose >= 2) | |
| 623 fprintf(stderr, "[W::%s] duplicated sequence '%s'\n", __func__, sn); | |
| 624 free(sn); | |
| 625 } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln; | |
| 626 } | |
| 627 } | |
| 628 while (*p != '\n') ++p; | |
| 629 } | |
| 630 return hdr_from_dict(d); | |
| 631 } | |
| 632 | |
| 633 bam_hdr_t *sam_hdr_read(htsFile *fp) | |
| 634 { | |
| 635 switch (fp->format.format) { | |
| 636 case bam: | |
| 637 return bam_hdr_read(fp->fp.bgzf); | |
| 638 | |
| 639 case cram: | |
| 640 return cram_header_to_bam(fp->fp.cram->header); | |
| 641 | |
| 642 case sam: { | |
| 643 kstring_t str; | |
| 644 bam_hdr_t *h; | |
| 645 int has_SQ = 0; | |
| 646 str.l = str.m = 0; str.s = 0; | |
| 647 while (hts_getline(fp, KS_SEP_LINE, &fp->line) >= 0) { | |
| 648 if (fp->line.s[0] != '@') break; | |
| 649 if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1; | |
| 650 kputsn(fp->line.s, fp->line.l, &str); | |
| 651 kputc('\n', &str); | |
| 652 } | |
| 653 if (! has_SQ && fp->fn_aux) { | |
| 654 char line[2048]; | |
| 655 FILE *f = fopen(fp->fn_aux, "r"); | |
| 656 if (f == NULL) return NULL; | |
| 657 while (fgets(line, sizeof line, f)) { | |
| 658 const char *name = strtok(line, "\t"); | |
| 659 const char *length = strtok(NULL, "\t"); | |
| 660 ksprintf(&str, "@SQ\tSN:%s\tLN:%s\n", name, length); | |
| 661 } | |
| 662 fclose(f); | |
| 663 } | |
| 664 if (str.l == 0) kputsn("", 0, &str); | |
| 665 h = sam_hdr_parse(str.l, str.s); | |
| 666 h->l_text = str.l; h->text = str.s; | |
| 667 return h; | |
| 668 } | |
| 669 | |
| 670 default: | |
| 671 abort(); | |
| 672 } | |
| 673 } | |
| 674 | |
| 675 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h) | |
| 676 { | |
| 677 switch (fp->format.format) { | |
| 678 case binary_format: | |
| 679 fp->format.category = sequence_data; | |
| 680 fp->format.format = bam; | |
| 681 /* fall-through */ | |
| 682 case bam: | |
| 683 bam_hdr_write(fp->fp.bgzf, h); | |
| 684 break; | |
| 685 | |
| 686 case cram: { | |
| 687 cram_fd *fd = fp->fp.cram; | |
| 688 if (cram_set_header(fd, bam_header_to_cram((bam_hdr_t *)h)) < 0) return -1; | |
| 689 if (fp->fn_aux) | |
| 690 cram_load_reference(fd, fp->fn_aux); | |
| 691 if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1; | |
| 692 } | |
| 693 break; | |
| 694 | |
| 695 case text_format: | |
| 696 fp->format.category = sequence_data; | |
| 697 fp->format.format = sam; | |
| 698 /* fall-through */ | |
| 699 case sam: { | |
| 700 char *p; | |
| 701 hputs(h->text, fp->fp.hfile); | |
| 702 p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!! | |
| 703 if (p == 0) { | |
| 704 int i; | |
| 705 for (i = 0; i < h->n_targets; ++i) { | |
| 706 fp->line.l = 0; | |
| 707 kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line); | |
| 708 kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line); | |
| 709 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; | |
| 710 } | |
| 711 } | |
| 712 if ( hflush(fp->fp.hfile) != 0 ) return -1; | |
| 713 } | |
| 714 break; | |
| 715 | |
| 716 default: | |
| 717 abort(); | |
| 718 } | |
| 719 return 0; | |
| 720 } | |
| 721 | |
| 722 /********************** | |
| 723 *** SAM record I/O *** | |
| 724 **********************/ | |
| 725 | |
| 726 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) | |
| 727 { | |
| 728 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0 | |
| 729 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t' | |
| 730 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l) | |
| 731 #define _parse_err(cond, msg) do { if ((cond) && hts_verbose >= 1) { fprintf(stderr, "[E::%s] " msg "\n", __func__); goto err_ret; } } while (0) | |
| 732 #define _parse_warn(cond, msg) if ((cond) && hts_verbose >= 2) fprintf(stderr, "[W::%s] " msg "\n", __func__) | |
| 733 | |
| 734 uint8_t *t; | |
| 735 char *p = s->s, *q; | |
| 736 int i; | |
| 737 kstring_t str; | |
| 738 bam1_core_t *c = &b->core; | |
| 739 | |
| 740 str.l = b->l_data = 0; | |
| 741 str.s = (char*)b->data; str.m = b->m_data; | |
| 742 memset(c, 0, 32); | |
| 743 if (h->cigar_tab == 0) { | |
| 744 h->cigar_tab = (int8_t*) malloc(128); | |
| 745 for (i = 0; i < 128; ++i) | |
| 746 h->cigar_tab[i] = -1; | |
| 747 for (i = 0; BAM_CIGAR_STR[i]; ++i) | |
| 748 h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i; | |
| 749 } | |
| 750 // qname | |
| 751 q = _read_token(p); | |
| 752 kputsn_(q, p - q, &str); | |
| 753 c->l_qname = p - q; | |
| 754 // flag | |
| 755 c->flag = strtol(p, &p, 0); | |
| 756 if (*p++ != '\t') goto err_ret; // malformated flag | |
| 757 // chr | |
| 758 q = _read_token(p); | |
| 759 if (strcmp(q, "*")) { | |
| 760 _parse_err(h->n_targets == 0, "missing SAM header"); | |
| 761 c->tid = bam_name2id(h, q); | |
| 762 _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped"); | |
| 763 } else c->tid = -1; | |
| 764 // pos | |
| 765 c->pos = strtol(p, &p, 10) - 1; | |
| 766 if (*p++ != '\t') goto err_ret; | |
| 767 if (c->pos < 0 && c->tid >= 0) { | |
| 768 _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped"); | |
| 769 c->tid = -1; | |
| 770 } | |
| 771 if (c->tid < 0) c->flag |= BAM_FUNMAP; | |
| 772 // mapq | |
| 773 c->qual = strtol(p, &p, 10); | |
| 774 if (*p++ != '\t') goto err_ret; | |
| 775 // cigar | |
| 776 if (*p != '*') { | |
| 777 uint32_t *cigar; | |
| 778 size_t n_cigar = 0; | |
| 779 for (q = p; *p && *p != '\t'; ++p) | |
| 780 if (!isdigit(*p)) ++n_cigar; | |
| 781 if (*p++ != '\t') goto err_ret; | |
| 782 _parse_err(n_cigar >= 65536, "too many CIGAR operations"); | |
| 783 c->n_cigar = n_cigar; | |
| 784 _get_mem(uint32_t, &cigar, &str, c->n_cigar<<2); | |
| 785 for (i = 0; i < c->n_cigar; ++i, ++q) { | |
| 786 int op; | |
| 787 cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT; | |
| 788 op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q]; | |
| 789 _parse_err(op < 0, "unrecognized CIGAR operator"); | |
| 790 cigar[i] |= op; | |
| 791 } | |
| 792 i = bam_cigar2rlen(c->n_cigar, cigar); | |
| 793 } else { | |
| 794 _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped"); | |
| 795 c->flag |= BAM_FUNMAP; | |
| 796 q = _read_token(p); | |
| 797 i = 1; | |
| 798 } | |
| 799 c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5); | |
| 800 // mate chr | |
| 801 q = _read_token(p); | |
| 802 if (strcmp(q, "=") == 0) c->mtid = c->tid; | |
| 803 else if (strcmp(q, "*") == 0) c->mtid = -1; | |
| 804 else c->mtid = bam_name2id(h, q); | |
| 805 // mpos | |
| 806 c->mpos = strtol(p, &p, 10) - 1; | |
| 807 if (*p++ != '\t') goto err_ret; | |
| 808 if (c->mpos < 0 && c->mtid >= 0) { | |
| 809 _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped"); | |
| 810 c->mtid = -1; | |
| 811 } | |
| 812 // tlen | |
| 813 c->isize = strtol(p, &p, 10); | |
| 814 if (*p++ != '\t') goto err_ret; | |
| 815 // seq | |
| 816 q = _read_token(p); | |
| 817 if (strcmp(q, "*")) { | |
| 818 c->l_qseq = p - q - 1; | |
| 819 i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname)); | |
| 820 _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length"); | |
| 821 i = (c->l_qseq + 1) >> 1; | |
| 822 _get_mem(uint8_t, &t, &str, i); | |
| 823 memset(t, 0, i); | |
| 824 for (i = 0; i < c->l_qseq; ++i) | |
| 825 t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2); | |
| 826 } else c->l_qseq = 0; | |
| 827 // qual | |
| 828 q = _read_token_aux(p); | |
| 829 _get_mem(uint8_t, &t, &str, c->l_qseq); | |
| 830 if (strcmp(q, "*")) { | |
| 831 _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length"); | |
| 832 for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33; | |
| 833 } else memset(t, 0xff, c->l_qseq); | |
| 834 // aux | |
| 835 // Note that (like the bam1_core_t fields) this aux data in b->data is | |
| 836 // stored in host endianness; so there is no byte swapping needed here. | |
| 837 while (p < s->s + s->l) { | |
| 838 uint8_t type; | |
| 839 q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays | |
| 840 _parse_err(p - q - 1 < 6, "incomplete aux field"); | |
| 841 kputsn_(q, 2, &str); | |
| 842 q += 3; type = *q++; ++q; // q points to value | |
| 843 if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { | |
| 844 kputc_('A', &str); | |
| 845 kputc_(*q, &str); | |
| 846 } else if (type == 'i' || type == 'I') { | |
| 847 if (*q == '-') { | |
| 848 long x = strtol(q, &q, 10); | |
| 849 if (x >= INT8_MIN) { | |
| 850 kputc_('c', &str); kputc_(x, &str); | |
| 851 } else if (x >= INT16_MIN) { | |
| 852 int16_t y = x; | |
| 853 kputc_('s', &str); kputsn_((char*)&y, 2, &str); | |
| 854 } else { | |
| 855 int32_t y = x; | |
| 856 kputc_('i', &str); kputsn_(&y, 4, &str); | |
| 857 } | |
| 858 } else { | |
| 859 unsigned long x = strtoul(q, &q, 10); | |
| 860 if (x <= UINT8_MAX) { | |
| 861 kputc_('C', &str); kputc_(x, &str); | |
| 862 } else if (x <= UINT16_MAX) { | |
| 863 uint16_t y = x; | |
| 864 kputc_('S', &str); kputsn_(&y, 2, &str); | |
| 865 } else { | |
| 866 uint32_t y = x; | |
| 867 kputc_('I', &str); kputsn_(&y, 4, &str); | |
| 868 } | |
| 869 } | |
| 870 } else if (type == 'f') { | |
| 871 float x; | |
| 872 x = strtod(q, &q); | |
| 873 kputc_('f', &str); kputsn_(&x, 4, &str); | |
| 874 } else if (type == 'd') { | |
| 875 double x; | |
| 876 x = strtod(q, &q); | |
| 877 kputc_('d', &str); kputsn_(&x, 8, &str); | |
| 878 } else if (type == 'Z' || type == 'H') { | |
| 879 kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL | |
| 880 } else if (type == 'B') { | |
| 881 int32_t n; | |
| 882 char *r; | |
| 883 _parse_err(p - q - 1 < 3, "incomplete B-typed aux field"); | |
| 884 type = *q++; // q points to the first ',' following the typing byte | |
| 885 for (r = q, n = 0; *r; ++r) | |
| 886 if (*r == ',') ++n; | |
| 887 kputc_('B', &str); kputc_(type, &str); kputsn_(&n, 4, &str); | |
| 888 // FIXME: to evaluate which is faster: a) aligned array and then memmove(); b) unaligned array; c) kputsn_() | |
| 889 if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); } | |
| 890 else if (type == 'C') while (q + 1 < p) { uint8_t x = strtoul(q + 1, &q, 0); kputc_(x, &str); } | |
| 891 else if (type == 's') while (q + 1 < p) { int16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); } | |
| 892 else if (type == 'S') while (q + 1 < p) { uint16_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 2, &str); } | |
| 893 else if (type == 'i') while (q + 1 < p) { int32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); } | |
| 894 else if (type == 'I') while (q + 1 < p) { uint32_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 4, &str); } | |
| 895 else if (type == 'f') while (q + 1 < p) { float x = strtod(q + 1, &q); kputsn_(&x, 4, &str); } | |
| 896 else _parse_err(1, "unrecognized type"); | |
| 897 } else _parse_err(1, "unrecognized type"); | |
| 898 } | |
| 899 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; | |
| 900 return 0; | |
| 901 | |
| 902 #undef _parse_warn | |
| 903 #undef _parse_err | |
| 904 #undef _get_mem | |
| 905 #undef _read_token_aux | |
| 906 #undef _read_token | |
| 907 err_ret: | |
| 908 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; | |
| 909 return -2; | |
| 910 } | |
| 911 | |
| 912 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b) | |
| 913 { | |
| 914 switch (fp->format.format) { | |
| 915 case bam: { | |
| 916 int r = bam_read1(fp->fp.bgzf, b); | |
| 917 if (r >= 0) { | |
| 918 if (b->core.tid >= h->n_targets || b->core.tid < -1 || | |
| 919 b->core.mtid >= h->n_targets || b->core.mtid < -1) | |
| 920 return -3; | |
| 921 } | |
| 922 return r; | |
| 923 } | |
| 924 | |
| 925 case cram: | |
| 926 return cram_get_bam_seq(fp->fp.cram, &b); | |
| 927 | |
| 928 case sam: { | |
| 929 int ret; | |
| 930 err_recover: | |
| 931 if (fp->line.l == 0) { | |
| 932 ret = hts_getline(fp, KS_SEP_LINE, &fp->line); | |
| 933 if (ret < 0) return -1; | |
| 934 } | |
| 935 ret = sam_parse1(&fp->line, h, b); | |
| 936 fp->line.l = 0; | |
| 937 if (ret < 0) { | |
| 938 if (hts_verbose >= 1) | |
| 939 fprintf(stderr, "[W::%s] parse error at line %lld\n", __func__, (long long)fp->lineno); | |
| 940 if (h->ignore_sam_err) goto err_recover; | |
| 941 } | |
| 942 return ret; | |
| 943 } | |
| 944 | |
| 945 default: | |
| 946 abort(); | |
| 947 } | |
| 948 } | |
| 949 | |
| 950 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) | |
| 951 { | |
| 952 int i; | |
| 953 uint8_t *s; | |
| 954 const bam1_core_t *c = &b->core; | |
| 955 | |
| 956 str->l = 0; | |
| 957 kputsn(bam_get_qname(b), c->l_qname-1, str); kputc('\t', str); // query name | |
| 958 kputw(c->flag, str); kputc('\t', str); // flag | |
| 959 if (c->tid >= 0) { // chr | |
| 960 kputs(h->target_name[c->tid] , str); | |
| 961 kputc('\t', str); | |
| 962 } else kputsn("*\t", 2, str); | |
| 963 kputw(c->pos + 1, str); kputc('\t', str); // pos | |
| 964 kputw(c->qual, str); kputc('\t', str); // qual | |
| 965 if (c->n_cigar) { // cigar | |
| 966 uint32_t *cigar = bam_get_cigar(b); | |
| 967 for (i = 0; i < c->n_cigar; ++i) { | |
| 968 kputw(bam_cigar_oplen(cigar[i]), str); | |
| 969 kputc(bam_cigar_opchr(cigar[i]), str); | |
| 970 } | |
| 971 } else kputc('*', str); | |
| 972 kputc('\t', str); | |
| 973 if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr | |
| 974 else if (c->mtid == c->tid) kputsn("=\t", 2, str); | |
| 975 else { | |
| 976 kputs(h->target_name[c->mtid], str); | |
| 977 kputc('\t', str); | |
| 978 } | |
| 979 kputw(c->mpos + 1, str); kputc('\t', str); // mate pos | |
| 980 kputw(c->isize, str); kputc('\t', str); // template len | |
| 981 if (c->l_qseq) { // seq and qual | |
| 982 uint8_t *s = bam_get_seq(b); | |
| 983 for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str); | |
| 984 kputc('\t', str); | |
| 985 s = bam_get_qual(b); | |
| 986 if (s[0] == 0xff) kputc('*', str); | |
| 987 else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str); | |
| 988 } else kputsn("*\t*", 3, str); | |
| 989 s = bam_get_aux(b); // aux | |
| 990 while (s+4 <= b->data + b->l_data) { | |
| 991 uint8_t type, key[2]; | |
| 992 key[0] = s[0]; key[1] = s[1]; | |
| 993 s += 2; type = *s++; | |
| 994 kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str); | |
| 995 if (type == 'A') { | |
| 996 kputsn("A:", 2, str); | |
| 997 kputc(*s, str); | |
| 998 ++s; | |
| 999 } else if (type == 'C') { | |
| 1000 kputsn("i:", 2, str); | |
| 1001 kputw(*s, str); | |
| 1002 ++s; | |
| 1003 } else if (type == 'c') { | |
| 1004 kputsn("i:", 2, str); | |
| 1005 kputw(*(int8_t*)s, str); | |
| 1006 ++s; | |
| 1007 } else if (type == 'S') { | |
| 1008 if (s+2 <= b->data + b->l_data) { | |
| 1009 kputsn("i:", 2, str); | |
| 1010 kputw(*(uint16_t*)s, str); | |
| 1011 s += 2; | |
| 1012 } else return -1; | |
| 1013 } else if (type == 's') { | |
| 1014 if (s+2 <= b->data + b->l_data) { | |
| 1015 kputsn("i:", 2, str); | |
| 1016 kputw(*(int16_t*)s, str); | |
| 1017 s += 2; | |
| 1018 } else return -1; | |
| 1019 } else if (type == 'I') { | |
| 1020 if (s+4 <= b->data + b->l_data) { | |
| 1021 kputsn("i:", 2, str); | |
| 1022 kputuw(*(uint32_t*)s, str); | |
| 1023 s += 4; | |
| 1024 } else return -1; | |
| 1025 } else if (type == 'i') { | |
| 1026 if (s+4 <= b->data + b->l_data) { | |
| 1027 kputsn("i:", 2, str); | |
| 1028 kputw(*(int32_t*)s, str); | |
| 1029 s += 4; | |
| 1030 } else return -1; | |
| 1031 } else if (type == 'f') { | |
| 1032 if (s+4 <= b->data + b->l_data) { | |
| 1033 ksprintf(str, "f:%g", *(float*)s); | |
| 1034 s += 4; | |
| 1035 } else return -1; | |
| 1036 | |
| 1037 } else if (type == 'd') { | |
| 1038 if (s+8 <= b->data + b->l_data) { | |
| 1039 ksprintf(str, "d:%g", *(double*)s); | |
| 1040 s += 8; | |
| 1041 } else return -1; | |
| 1042 } else if (type == 'Z' || type == 'H') { | |
| 1043 kputc(type, str); kputc(':', str); | |
| 1044 while (s < b->data + b->l_data && *s) kputc(*s++, str); | |
| 1045 if (s >= b->data + b->l_data) | |
| 1046 return -1; | |
| 1047 ++s; | |
| 1048 } else if (type == 'B') { | |
| 1049 uint8_t sub_type = *(s++); | |
| 1050 int32_t n; | |
| 1051 memcpy(&n, s, 4); | |
| 1052 s += 4; // no point to the start of the array | |
| 1053 if (s + n >= b->data + b->l_data) | |
| 1054 return -1; | |
| 1055 kputsn("B:", 2, str); kputc(sub_type, str); // write the typing | |
| 1056 for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if" | |
| 1057 kputc(',', str); | |
| 1058 if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; } | |
| 1059 else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; } | |
| 1060 else if ('s' == sub_type) { kputw(*(int16_t*)s, str); s += 2; } | |
| 1061 else if ('S' == sub_type) { kputw(*(uint16_t*)s, str); s += 2; } | |
| 1062 else if ('i' == sub_type) { kputw(*(int32_t*)s, str); s += 4; } | |
| 1063 else if ('I' == sub_type) { kputuw(*(uint32_t*)s, str); s += 4; } | |
| 1064 else if ('f' == sub_type) { ksprintf(str, "%g", *(float*)s); s += 4; } | |
| 1065 } | |
| 1066 } | |
| 1067 } | |
| 1068 return str->l; | |
| 1069 } | |
| 1070 | |
| 1071 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b) | |
| 1072 { | |
| 1073 switch (fp->format.format) { | |
| 1074 case binary_format: | |
| 1075 fp->format.category = sequence_data; | |
| 1076 fp->format.format = bam; | |
| 1077 /* fall-through */ | |
| 1078 case bam: | |
| 1079 return bam_write1(fp->fp.bgzf, b); | |
| 1080 | |
| 1081 case cram: | |
| 1082 return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b); | |
| 1083 | |
| 1084 case text_format: | |
| 1085 fp->format.category = sequence_data; | |
| 1086 fp->format.format = sam; | |
| 1087 /* fall-through */ | |
| 1088 case sam: | |
| 1089 if (sam_format1(h, b, &fp->line) < 0) return -1; | |
| 1090 kputc('\n', &fp->line); | |
| 1091 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; | |
| 1092 return fp->line.l; | |
| 1093 | |
| 1094 default: | |
| 1095 abort(); | |
| 1096 } | |
| 1097 } | |
| 1098 | |
| 1099 /************************ | |
| 1100 *** Auxiliary fields *** | |
| 1101 ************************/ | |
| 1102 | |
| 1103 void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) | |
| 1104 { | |
| 1105 int ori_len = b->l_data; | |
| 1106 b->l_data += 3 + len; | |
| 1107 if (b->m_data < b->l_data) { | |
| 1108 b->m_data = b->l_data; | |
| 1109 kroundup32(b->m_data); | |
| 1110 b->data = (uint8_t*)realloc(b->data, b->m_data); | |
| 1111 } | |
| 1112 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; | |
| 1113 b->data[ori_len + 2] = type; | |
| 1114 memcpy(b->data + ori_len + 3, data, len); | |
| 1115 } | |
| 1116 | |
| 1117 static inline uint8_t *skip_aux(uint8_t *s) | |
| 1118 { | |
| 1119 int size = aux_type2size(*s); ++s; // skip type | |
| 1120 uint32_t n; | |
| 1121 switch (size) { | |
| 1122 case 'Z': | |
| 1123 case 'H': | |
| 1124 while (*s) ++s; | |
| 1125 return s + 1; | |
| 1126 case 'B': | |
| 1127 size = aux_type2size(*s); ++s; | |
| 1128 memcpy(&n, s, 4); s += 4; | |
| 1129 return s + size * n; | |
| 1130 case 0: | |
| 1131 abort(); | |
| 1132 break; | |
| 1133 default: | |
| 1134 return s + size; | |
| 1135 } | |
| 1136 } | |
| 1137 | |
| 1138 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) | |
| 1139 { | |
| 1140 uint8_t *s; | |
| 1141 int y = tag[0]<<8 | tag[1]; | |
| 1142 s = bam_get_aux(b); | |
| 1143 while (s < b->data + b->l_data) { | |
| 1144 int x = (int)s[0]<<8 | s[1]; | |
| 1145 s += 2; | |
| 1146 if (x == y) return s; | |
| 1147 s = skip_aux(s); | |
| 1148 } | |
| 1149 return 0; | |
| 1150 } | |
| 1151 // s MUST BE returned by bam_aux_get() | |
| 1152 int bam_aux_del(bam1_t *b, uint8_t *s) | |
| 1153 { | |
| 1154 uint8_t *p, *aux; | |
| 1155 int l_aux = bam_get_l_aux(b); | |
| 1156 aux = bam_get_aux(b); | |
| 1157 p = s - 2; | |
| 1158 s = skip_aux(s); | |
| 1159 memmove(p, s, l_aux - (s - aux)); | |
| 1160 b->l_data -= s - p; | |
| 1161 return 0; | |
| 1162 } | |
| 1163 | |
| 1164 int32_t bam_aux2i(const uint8_t *s) | |
| 1165 { | |
| 1166 int type; | |
| 1167 type = *s++; | |
| 1168 if (type == 'c') return (int32_t)*(int8_t*)s; | |
| 1169 else if (type == 'C') return (int32_t)*(uint8_t*)s; | |
| 1170 else if (type == 's') return (int32_t)*(int16_t*)s; | |
| 1171 else if (type == 'S') return (int32_t)*(uint16_t*)s; | |
| 1172 else if (type == 'i' || type == 'I') return *(int32_t*)s; | |
| 1173 else return 0; | |
| 1174 } | |
| 1175 | |
| 1176 double bam_aux2f(const uint8_t *s) | |
| 1177 { | |
| 1178 int type; | |
| 1179 type = *s++; | |
| 1180 if (type == 'd') return *(double*)s; | |
| 1181 else if (type == 'f') return *(float*)s; | |
| 1182 else return 0.0; | |
| 1183 } | |
| 1184 | |
| 1185 char bam_aux2A(const uint8_t *s) | |
| 1186 { | |
| 1187 int type; | |
| 1188 type = *s++; | |
| 1189 if (type == 'A') return *(char*)s; | |
| 1190 else return 0; | |
| 1191 } | |
| 1192 | |
| 1193 char *bam_aux2Z(const uint8_t *s) | |
| 1194 { | |
| 1195 int type; | |
| 1196 type = *s++; | |
| 1197 if (type == 'Z' || type == 'H') return (char*)s; | |
| 1198 else return 0; | |
| 1199 } | |
| 1200 | |
| 1201 int sam_open_mode(char *mode, const char *fn, const char *format) | |
| 1202 { | |
| 1203 // TODO Parse "bam5" etc for compression level | |
| 1204 if (format == NULL) { | |
| 1205 // Try to pick a format based on the filename extension | |
| 1206 const char *ext = fn? strrchr(fn, '.') : NULL; | |
| 1207 if (ext == NULL || strchr(ext, '/')) return -1; | |
| 1208 return sam_open_mode(mode, fn, ext+1); | |
| 1209 } | |
| 1210 else if (strcmp(format, "bam") == 0) strcpy(mode, "b"); | |
| 1211 else if (strcmp(format, "cram") == 0) strcpy(mode, "c"); | |
| 1212 else if (strcmp(format, "sam") == 0) strcpy(mode, ""); | |
| 1213 else return -1; | |
| 1214 | |
| 1215 return 0; | |
| 1216 } | |
| 1217 | |
| 1218 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n)) | |
| 1219 int bam_str2flag(const char *str) | |
| 1220 { | |
| 1221 char *end, *beg = (char*) str; | |
| 1222 long int flag = strtol(str, &end, 0); | |
| 1223 if ( end!=str ) return flag; // the conversion was successful | |
| 1224 flag = 0; | |
| 1225 while ( *str ) | |
| 1226 { | |
| 1227 end = beg; | |
| 1228 while ( *end && *end!=',' ) end++; | |
| 1229 if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED; | |
| 1230 else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR; | |
| 1231 else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP; | |
| 1232 else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP; | |
| 1233 else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE; | |
| 1234 else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE; | |
| 1235 else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1; | |
| 1236 else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2; | |
| 1237 else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY; | |
| 1238 else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL; | |
| 1239 else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP; | |
| 1240 else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY; | |
| 1241 else return -1; | |
| 1242 if ( !*end ) break; | |
| 1243 beg = end + 1; | |
| 1244 } | |
| 1245 return flag; | |
| 1246 } | |
| 1247 | |
| 1248 char *bam_flag2str(int flag) | |
| 1249 { | |
| 1250 kstring_t str = {0,0,0}; | |
| 1251 if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED"); | |
| 1252 if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR"); | |
| 1253 if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP"); | |
| 1254 if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP"); | |
| 1255 if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE"); | |
| 1256 if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE"); | |
| 1257 if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1"); | |
| 1258 if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2"); | |
| 1259 if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY"); | |
| 1260 if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL"); | |
| 1261 if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP"); | |
| 1262 if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY"); | |
| 1263 if ( str.l == 0 ) kputsn("", 0, &str); | |
| 1264 return str.s; | |
| 1265 } | |
| 1266 | |
| 1267 | |
| 1268 /************************** | |
| 1269 *** Pileup and Mpileup *** | |
| 1270 **************************/ | |
| 1271 | |
| 1272 #if !defined(BAM_NO_PILEUP) | |
| 1273 | |
| 1274 #include <assert.h> | |
| 1275 | |
| 1276 /******************* | |
| 1277 *** Memory pool *** | |
| 1278 *******************/ | |
| 1279 | |
| 1280 typedef struct { | |
| 1281 int k, x, y, end; | |
| 1282 } cstate_t; | |
| 1283 | |
| 1284 static cstate_t g_cstate_null = { -1, 0, 0, 0 }; | |
| 1285 | |
| 1286 typedef struct __linkbuf_t { | |
| 1287 bam1_t b; | |
| 1288 int32_t beg, end; | |
| 1289 cstate_t s; | |
| 1290 struct __linkbuf_t *next; | |
| 1291 } lbnode_t; | |
| 1292 | |
| 1293 typedef struct { | |
| 1294 int cnt, n, max; | |
| 1295 lbnode_t **buf; | |
| 1296 } mempool_t; | |
| 1297 | |
| 1298 static mempool_t *mp_init(void) | |
| 1299 { | |
| 1300 mempool_t *mp; | |
| 1301 mp = (mempool_t*)calloc(1, sizeof(mempool_t)); | |
| 1302 return mp; | |
| 1303 } | |
| 1304 static void mp_destroy(mempool_t *mp) | |
| 1305 { | |
| 1306 int k; | |
| 1307 for (k = 0; k < mp->n; ++k) { | |
| 1308 free(mp->buf[k]->b.data); | |
| 1309 free(mp->buf[k]); | |
| 1310 } | |
| 1311 free(mp->buf); | |
| 1312 free(mp); | |
| 1313 } | |
| 1314 static inline lbnode_t *mp_alloc(mempool_t *mp) | |
| 1315 { | |
| 1316 ++mp->cnt; | |
| 1317 if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); | |
| 1318 else return mp->buf[--mp->n]; | |
| 1319 } | |
| 1320 static inline void mp_free(mempool_t *mp, lbnode_t *p) | |
| 1321 { | |
| 1322 --mp->cnt; p->next = 0; // clear lbnode_t::next here | |
| 1323 if (mp->n == mp->max) { | |
| 1324 mp->max = mp->max? mp->max<<1 : 256; | |
| 1325 mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); | |
| 1326 } | |
| 1327 mp->buf[mp->n++] = p; | |
| 1328 } | |
| 1329 | |
| 1330 /********************** | |
| 1331 *** CIGAR resolver *** | |
| 1332 **********************/ | |
| 1333 | |
| 1334 /* s->k: the index of the CIGAR operator that has just been processed. | |
| 1335 s->x: the reference coordinate of the start of s->k | |
| 1336 s->y: the query coordiante of the start of s->k | |
| 1337 */ | |
| 1338 static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s) | |
| 1339 { | |
| 1340 #define _cop(c) ((c)&BAM_CIGAR_MASK) | |
| 1341 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT) | |
| 1342 | |
| 1343 bam1_t *b = p->b; | |
| 1344 bam1_core_t *c = &b->core; | |
| 1345 uint32_t *cigar = bam_get_cigar(b); | |
| 1346 int k; | |
| 1347 // determine the current CIGAR operation | |
| 1348 // fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y); | |
| 1349 if (s->k == -1) { // never processed | |
| 1350 if (c->n_cigar == 1) { // just one operation, save a loop | |
| 1351 if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; | |
| 1352 } else { // find the first match or deletion | |
| 1353 for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { | |
| 1354 int op = _cop(cigar[k]); | |
| 1355 int l = _cln(cigar[k]); | |
| 1356 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break; | |
| 1357 else if (op == BAM_CREF_SKIP) s->x += l; | |
| 1358 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; | |
| 1359 } | |
| 1360 assert(k < c->n_cigar); | |
| 1361 s->k = k; | |
| 1362 } | |
| 1363 } else { // the read has been processed before | |
| 1364 int op, l = _cln(cigar[s->k]); | |
| 1365 if (pos - s->x >= l) { // jump to the next operation | |
| 1366 assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case | |
| 1367 op = _cop(cigar[s->k+1]); | |
| 1368 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop | |
| 1369 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; | |
| 1370 s->x += l; | |
| 1371 ++s->k; | |
| 1372 } else { // find the next M/D/N/=/X | |
| 1373 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; | |
| 1374 s->x += l; | |
| 1375 for (k = s->k + 1; k < c->n_cigar; ++k) { | |
| 1376 op = _cop(cigar[k]), l = _cln(cigar[k]); | |
| 1377 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; | |
| 1378 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; | |
| 1379 } | |
| 1380 s->k = k; | |
| 1381 } | |
| 1382 assert(s->k < c->n_cigar); // otherwise a bug | |
| 1383 } // else, do nothing | |
| 1384 } | |
| 1385 { // collect pileup information | |
| 1386 int op, l; | |
| 1387 op = _cop(cigar[s->k]); l = _cln(cigar[s->k]); | |
| 1388 p->is_del = p->indel = p->is_refskip = 0; | |
| 1389 if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation | |
| 1390 int op2 = _cop(cigar[s->k+1]); | |
| 1391 int l2 = _cln(cigar[s->k+1]); | |
| 1392 if (op2 == BAM_CDEL) p->indel = -(int)l2; | |
| 1393 else if (op2 == BAM_CINS) p->indel = l2; | |
| 1394 else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding | |
| 1395 int l3 = 0; | |
| 1396 for (k = s->k + 2; k < c->n_cigar; ++k) { | |
| 1397 op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); | |
| 1398 if (op2 == BAM_CINS) l3 += l2; | |
| 1399 else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; | |
| 1400 } | |
| 1401 if (l3 > 0) p->indel = l3; | |
| 1402 } | |
| 1403 } | |
| 1404 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { | |
| 1405 p->qpos = s->y + (pos - s->x); | |
| 1406 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { | |
| 1407 p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! | |
| 1408 p->is_refskip = (op == BAM_CREF_SKIP); | |
| 1409 } // cannot be other operations; otherwise a bug | |
| 1410 p->is_head = (pos == c->pos); p->is_tail = (pos == s->end); | |
| 1411 } | |
| 1412 return 1; | |
| 1413 } | |
| 1414 | |
| 1415 /*********************** | |
| 1416 *** Pileup iterator *** | |
| 1417 ***********************/ | |
| 1418 | |
| 1419 // Dictionary of overlapping reads | |
| 1420 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *) | |
| 1421 typedef khash_t(olap_hash) olap_hash_t; | |
| 1422 | |
| 1423 struct __bam_plp_t { | |
| 1424 mempool_t *mp; | |
| 1425 lbnode_t *head, *tail, *dummy; | |
| 1426 int32_t tid, pos, max_tid, max_pos; | |
| 1427 int is_eof, max_plp, error, maxcnt; | |
| 1428 uint64_t id; | |
| 1429 bam_pileup1_t *plp; | |
| 1430 // for the "auto" interface only | |
| 1431 bam1_t *b; | |
| 1432 bam_plp_auto_f func; | |
| 1433 void *data; | |
| 1434 olap_hash_t *overlaps; | |
| 1435 }; | |
| 1436 | |
| 1437 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) | |
| 1438 { | |
| 1439 bam_plp_t iter; | |
| 1440 iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t)); | |
| 1441 iter->mp = mp_init(); | |
| 1442 iter->head = iter->tail = mp_alloc(iter->mp); | |
| 1443 iter->dummy = mp_alloc(iter->mp); | |
| 1444 iter->max_tid = iter->max_pos = -1; | |
| 1445 iter->maxcnt = 8000; | |
| 1446 if (func) { | |
| 1447 iter->func = func; | |
| 1448 iter->data = data; | |
| 1449 iter->b = bam_init1(); | |
| 1450 } | |
| 1451 return iter; | |
| 1452 } | |
| 1453 | |
| 1454 void bam_plp_init_overlaps(bam_plp_t iter) | |
| 1455 { | |
| 1456 iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads | |
| 1457 } | |
| 1458 | |
| 1459 void bam_plp_destroy(bam_plp_t iter) | |
| 1460 { | |
| 1461 if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps); | |
| 1462 mp_free(iter->mp, iter->dummy); | |
| 1463 mp_free(iter->mp, iter->head); | |
| 1464 if (iter->mp->cnt != 0) | |
| 1465 fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt); | |
| 1466 mp_destroy(iter->mp); | |
| 1467 if (iter->b) bam_destroy1(iter->b); | |
| 1468 free(iter->plp); | |
| 1469 free(iter); | |
| 1470 } | |
| 1471 | |
| 1472 | |
| 1473 //--------------------------------- | |
| 1474 //--- Tweak overlapping reads | |
| 1475 //--------------------------------- | |
| 1476 | |
| 1477 /** | |
| 1478 * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index | |
| 1479 * cigar_iref2iseq_next() - get the next CMATCH base | |
| 1480 * @cigar: pointer to current cigar block (rw) | |
| 1481 * @cigar_max: pointer just beyond the last cigar block | |
| 1482 * @icig: position within the current cigar block (rw) | |
| 1483 * @iseq: position in the sequence (rw) | |
| 1484 * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw) | |
| 1485 * | |
| 1486 * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered. | |
| 1487 */ | |
| 1488 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) | |
| 1489 { | |
| 1490 int pos = *iref; | |
| 1491 if ( pos < 0 ) return -1; | |
| 1492 *icig = 0; | |
| 1493 *iseq = 0; | |
| 1494 *iref = 0; | |
| 1495 while ( *cigar<cigar_max ) | |
| 1496 { | |
| 1497 int cig = (**cigar) & BAM_CIGAR_MASK; | |
| 1498 int ncig = (**cigar) >> BAM_CIGAR_SHIFT; | |
| 1499 | |
| 1500 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } | |
| 1501 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; } | |
| 1502 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) | |
| 1503 { | |
| 1504 pos -= ncig; | |
| 1505 if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; } | |
| 1506 (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig; | |
| 1507 continue; | |
| 1508 } | |
| 1509 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } | |
| 1510 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) | |
| 1511 { | |
| 1512 pos -= ncig; | |
| 1513 if ( pos<0 ) pos = 0; | |
| 1514 (*cigar)++; *icig = 0; *iref += ncig; | |
| 1515 continue; | |
| 1516 } | |
| 1517 fprintf(stderr,"todo: cigar %d\n", cig); | |
| 1518 assert(0); | |
| 1519 } | |
| 1520 *iseq = -1; | |
| 1521 return -1; | |
| 1522 } | |
| 1523 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) | |
| 1524 { | |
| 1525 while ( *cigar < cigar_max ) | |
| 1526 { | |
| 1527 int cig = (**cigar) & BAM_CIGAR_MASK; | |
| 1528 int ncig = (**cigar) >> BAM_CIGAR_SHIFT; | |
| 1529 | |
| 1530 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) | |
| 1531 { | |
| 1532 if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; } | |
| 1533 (*iseq)++; (*icig)++; (*iref)++; | |
| 1534 return BAM_CMATCH; | |
| 1535 } | |
| 1536 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; } | |
| 1537 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } | |
| 1538 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } | |
| 1539 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; } | |
| 1540 fprintf(stderr,"todo: cigar %d\n", cig); | |
| 1541 assert(0); | |
| 1542 } | |
| 1543 *iseq = -1; | |
| 1544 *iref = -1; | |
| 1545 return -1; | |
| 1546 } | |
| 1547 | |
| 1548 static void tweak_overlap_quality(bam1_t *a, bam1_t *b) | |
| 1549 { | |
| 1550 uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; | |
| 1551 uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; | |
| 1552 int a_icig = 0, a_iseq = 0; | |
| 1553 int b_icig = 0, b_iseq = 0; | |
| 1554 uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); | |
| 1555 uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); | |
| 1556 | |
| 1557 int iref = b->core.pos; | |
| 1558 int a_iref = iref - a->core.pos; | |
| 1559 int b_iref = iref - b->core.pos; | |
| 1560 int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); | |
| 1561 if ( a_ret<0 ) return; // no overlap | |
| 1562 int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); | |
| 1563 if ( b_ret<0 ) return; // no overlap | |
| 1564 | |
| 1565 #if DBG | |
| 1566 fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, | |
| 1567 a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b))); | |
| 1568 #endif | |
| 1569 | |
| 1570 while ( 1 ) | |
| 1571 { | |
| 1572 // Increment reference position | |
| 1573 while ( a_iref>=0 && a_iref < iref - a->core.pos ) | |
| 1574 a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); | |
| 1575 if ( a_ret<0 ) break; // done | |
| 1576 if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos; | |
| 1577 | |
| 1578 while ( b_iref>=0 && b_iref < iref - b->core.pos ) | |
| 1579 b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); | |
| 1580 if ( b_ret<0 ) break; // done | |
| 1581 if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos; | |
| 1582 | |
| 1583 iref++; | |
| 1584 if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels | |
| 1585 | |
| 1586 if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) | |
| 1587 { | |
| 1588 #if DBG | |
| 1589 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]); | |
| 1590 #endif | |
| 1591 // we are very confident about this base | |
| 1592 int qual = a_qual[a_iseq] + b_qual[b_iseq]; | |
| 1593 a_qual[a_iseq] = qual>200 ? 200 : qual; | |
| 1594 b_qual[b_iseq] = 0; | |
| 1595 } | |
| 1596 else | |
| 1597 { | |
| 1598 if ( a_qual[a_iseq] >= b_qual[b_iseq] ) | |
| 1599 { | |
| 1600 #if DBG | |
| 1601 fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower(seq_nt16_str[bam_seqi(b_seq,b_iseq)])); | |
| 1602 #endif | |
| 1603 a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch | |
| 1604 b_qual[b_iseq] = 0; | |
| 1605 } | |
| 1606 else | |
| 1607 { | |
| 1608 #if DBG | |
| 1609 fprintf(stderr,"[%c/%c]",tolower(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]); | |
| 1610 #endif | |
| 1611 b_qual[b_iseq] = 0.8 * b_qual[b_iseq]; | |
| 1612 a_qual[a_iseq] = 0; | |
| 1613 } | |
| 1614 } | |
| 1615 } | |
| 1616 #if DBG | |
| 1617 fprintf(stderr,"\n"); | |
| 1618 #endif | |
| 1619 } | |
| 1620 | |
| 1621 // Fix overlapping reads. Simple soft-clipping did not give good results. | |
| 1622 // Lowering qualities of unwanted bases is more selective and works better. | |
| 1623 // | |
| 1624 static void overlap_push(bam_plp_t iter, lbnode_t *node) | |
| 1625 { | |
| 1626 if ( !iter->overlaps ) return; | |
| 1627 | |
| 1628 // mapped mates and paired reads only | |
| 1629 if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return; | |
| 1630 | |
| 1631 // no overlap possible, unless some wild cigar | |
| 1632 if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return; | |
| 1633 | |
| 1634 khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b)); | |
| 1635 if ( kitr==kh_end(iter->overlaps) ) | |
| 1636 { | |
| 1637 int ret; | |
| 1638 kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret); | |
| 1639 kh_value(iter->overlaps, kitr) = node; | |
| 1640 } | |
| 1641 else | |
| 1642 { | |
| 1643 lbnode_t *a = kh_value(iter->overlaps, kitr); | |
| 1644 tweak_overlap_quality(&a->b, &node->b); | |
| 1645 kh_del(olap_hash, iter->overlaps, kitr); | |
| 1646 assert(a->end-1 == a->s.end); | |
| 1647 a->end = a->b.core.pos + bam_cigar2rlen(a->b.core.n_cigar, bam_get_cigar(&a->b)); | |
| 1648 a->s.end = a->end - 1; | |
| 1649 } | |
| 1650 } | |
| 1651 | |
| 1652 static void overlap_remove(bam_plp_t iter, const bam1_t *b) | |
| 1653 { | |
| 1654 if ( !iter->overlaps ) return; | |
| 1655 | |
| 1656 khiter_t kitr; | |
| 1657 if ( b ) | |
| 1658 { | |
| 1659 kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b)); | |
| 1660 if ( kitr!=kh_end(iter->overlaps) ) | |
| 1661 kh_del(olap_hash, iter->overlaps, kitr); | |
| 1662 } | |
| 1663 else | |
| 1664 { | |
| 1665 // remove all | |
| 1666 for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++) | |
| 1667 if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr); | |
| 1668 } | |
| 1669 } | |
| 1670 | |
| 1671 | |
| 1672 | |
| 1673 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns | |
| 1674 // pointer to the piled records if next position is ready or NULL if there is not enough records in the | |
| 1675 // buffer yet (the current position is still the maximum position across all buffered reads). | |
| 1676 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) | |
| 1677 { | |
| 1678 if (iter->error) { *_n_plp = -1; return 0; } | |
| 1679 *_n_plp = 0; | |
| 1680 if (iter->is_eof && iter->head->next == 0) return 0; | |
| 1681 while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) { | |
| 1682 int n_plp = 0; | |
| 1683 lbnode_t *p, *q; | |
| 1684 // write iter->plp at iter->pos | |
| 1685 iter->dummy->next = iter->head; | |
| 1686 for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) { | |
| 1687 if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove | |
| 1688 overlap_remove(iter, &p->b); | |
| 1689 q->next = p->next; mp_free(iter->mp, p); p = q; | |
| 1690 } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup | |
| 1691 if (n_plp == iter->max_plp) { // then double the capacity | |
| 1692 iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256; | |
| 1693 iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp); | |
| 1694 } | |
| 1695 iter->plp[n_plp].b = &p->b; | |
| 1696 if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true... | |
| 1697 } | |
| 1698 } | |
| 1699 iter->head = iter->dummy->next; // dummy->next may be changed | |
| 1700 *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos; | |
| 1701 // update iter->tid and iter->pos | |
| 1702 if (iter->head->next) { | |
| 1703 if (iter->tid > iter->head->b.core.tid) { | |
| 1704 fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__); | |
| 1705 iter->error = 1; | |
| 1706 *_n_plp = -1; | |
| 1707 return 0; | |
| 1708 } | |
| 1709 } | |
| 1710 if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence | |
| 1711 iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference | |
| 1712 } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid | |
| 1713 iter->pos = iter->head->beg; // jump to the next position | |
| 1714 } else ++iter->pos; // scan contiguously | |
| 1715 // return | |
| 1716 if (n_plp) return iter->plp; | |
| 1717 if (iter->is_eof && iter->head->next == 0) break; | |
| 1718 } | |
| 1719 return 0; | |
| 1720 } | |
| 1721 | |
| 1722 int bam_plp_push(bam_plp_t iter, const bam1_t *b) | |
| 1723 { | |
| 1724 if (iter->error) return -1; | |
| 1725 if (b) { | |
| 1726 if (b->core.tid < 0) { overlap_remove(iter, b); return 0; } | |
| 1727 // Skip only unmapped reads here, any additional filtering must be done in iter->func | |
| 1728 if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; } | |
| 1729 if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) | |
| 1730 { | |
| 1731 overlap_remove(iter, b); | |
| 1732 return 0; | |
| 1733 } | |
| 1734 bam_copy1(&iter->tail->b, b); | |
| 1735 overlap_push(iter, iter->tail); | |
| 1736 #ifndef BAM_NO_ID | |
| 1737 iter->tail->b.id = iter->id++; | |
| 1738 #endif | |
| 1739 iter->tail->beg = b->core.pos; | |
| 1740 iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); | |
| 1741 iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t | |
| 1742 if (b->core.tid < iter->max_tid) { | |
| 1743 fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n"); | |
| 1744 iter->error = 1; | |
| 1745 return -1; | |
| 1746 } | |
| 1747 if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) { | |
| 1748 fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n"); | |
| 1749 iter->error = 1; | |
| 1750 return -1; | |
| 1751 } | |
| 1752 iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg; | |
| 1753 if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { | |
| 1754 iter->tail->next = mp_alloc(iter->mp); | |
| 1755 iter->tail = iter->tail->next; | |
| 1756 } | |
| 1757 } else iter->is_eof = 1; | |
| 1758 return 0; | |
| 1759 } | |
| 1760 | |
| 1761 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) | |
| 1762 { | |
| 1763 const bam_pileup1_t *plp; | |
| 1764 if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } | |
| 1765 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; | |
| 1766 else { // no pileup line can be obtained; read alignments | |
| 1767 *_n_plp = 0; | |
| 1768 if (iter->is_eof) return 0; | |
| 1769 int ret; | |
| 1770 while ( (ret=iter->func(iter->data, iter->b)) >= 0) { | |
| 1771 if (bam_plp_push(iter, iter->b) < 0) { | |
| 1772 *_n_plp = -1; | |
| 1773 return 0; | |
| 1774 } | |
| 1775 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; | |
| 1776 // otherwise no pileup line can be returned; read the next alignment. | |
| 1777 } | |
| 1778 if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; } | |
| 1779 bam_plp_push(iter, 0); | |
| 1780 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; | |
| 1781 return 0; | |
| 1782 } | |
| 1783 } | |
| 1784 | |
| 1785 void bam_plp_reset(bam_plp_t iter) | |
| 1786 { | |
| 1787 lbnode_t *p, *q; | |
| 1788 iter->max_tid = iter->max_pos = -1; | |
| 1789 iter->tid = iter->pos = 0; | |
| 1790 iter->is_eof = 0; | |
| 1791 for (p = iter->head; p->next;) { | |
| 1792 overlap_remove(iter, NULL); | |
| 1793 q = p->next; | |
| 1794 mp_free(iter->mp, p); | |
| 1795 p = q; | |
| 1796 } | |
| 1797 iter->head = iter->tail; | |
| 1798 } | |
| 1799 | |
| 1800 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) | |
| 1801 { | |
| 1802 iter->maxcnt = maxcnt; | |
| 1803 } | |
| 1804 | |
| 1805 /************************ | |
| 1806 *** Mpileup iterator *** | |
| 1807 ************************/ | |
| 1808 | |
| 1809 struct __bam_mplp_t { | |
| 1810 int n; | |
| 1811 uint64_t min, *pos; | |
| 1812 bam_plp_t *iter; | |
| 1813 int *n_plp; | |
| 1814 const bam_pileup1_t **plp; | |
| 1815 }; | |
| 1816 | |
| 1817 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) | |
| 1818 { | |
| 1819 int i; | |
| 1820 bam_mplp_t iter; | |
| 1821 iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t)); | |
| 1822 iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t)); | |
| 1823 iter->n_plp = (int*)calloc(n, sizeof(int)); | |
| 1824 iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*)); | |
| 1825 iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t)); | |
| 1826 iter->n = n; | |
| 1827 iter->min = (uint64_t)-1; | |
| 1828 for (i = 0; i < n; ++i) { | |
| 1829 iter->iter[i] = bam_plp_init(func, data[i]); | |
| 1830 iter->pos[i] = iter->min; | |
| 1831 } | |
| 1832 return iter; | |
| 1833 } | |
| 1834 | |
| 1835 void bam_mplp_init_overlaps(bam_mplp_t iter) | |
| 1836 { | |
| 1837 int i; | |
| 1838 for (i = 0; i < iter->n; ++i) | |
| 1839 bam_plp_init_overlaps(iter->iter[i]); | |
| 1840 } | |
| 1841 | |
| 1842 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) | |
| 1843 { | |
| 1844 int i; | |
| 1845 for (i = 0; i < iter->n; ++i) | |
| 1846 iter->iter[i]->maxcnt = maxcnt; | |
| 1847 } | |
| 1848 | |
| 1849 void bam_mplp_destroy(bam_mplp_t iter) | |
| 1850 { | |
| 1851 int i; | |
| 1852 for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); | |
| 1853 free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp); | |
| 1854 free(iter); | |
| 1855 } | |
| 1856 | |
| 1857 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) | |
| 1858 { | |
| 1859 int i, ret = 0; | |
| 1860 uint64_t new_min = (uint64_t)-1; | |
| 1861 for (i = 0; i < iter->n; ++i) { | |
| 1862 if (iter->pos[i] == iter->min) { | |
| 1863 int tid, pos; | |
| 1864 iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); | |
| 1865 if ( iter->iter[i]->error ) return -1; | |
| 1866 iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0; | |
| 1867 } | |
| 1868 if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i]; | |
| 1869 } | |
| 1870 iter->min = new_min; | |
| 1871 if (new_min == (uint64_t)-1) return 0; | |
| 1872 *_tid = new_min>>32; *_pos = (uint32_t)new_min; | |
| 1873 for (i = 0; i < iter->n; ++i) { | |
| 1874 if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line" | |
| 1875 n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; | |
| 1876 ++ret; | |
| 1877 } else n_plp[i] = 0, plp[i] = 0; | |
| 1878 } | |
| 1879 return ret; | |
| 1880 } | |
| 1881 | |
| 1882 #endif // ~!defined(BAM_NO_PILEUP) |
