Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/vcfutils.c @ 0:dfa3745e5fd8
Uploaded
| author | youngkim |
|---|---|
| date | Thu, 24 Mar 2016 17:12:52 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfa3745e5fd8 |
|---|---|
| 1 /* vcfutils.c -- allele-related utility functions. | |
| 2 | |
| 3 Copyright (C) 2012-2014 Genome Research Ltd. | |
| 4 | |
| 5 Author: Petr Danecek <pd3@sanger.ac.uk> | |
| 6 | |
| 7 Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 8 of this software and associated documentation files (the "Software"), to deal | |
| 9 in the Software without restriction, including without limitation the rights | |
| 10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 11 copies of the Software, and to permit persons to whom the Software is | |
| 12 furnished to do so, subject to the following conditions: | |
| 13 | |
| 14 The above copyright notice and this permission notice shall be included in | |
| 15 all copies or substantial portions of the Software. | |
| 16 | |
| 17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
| 20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
| 22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
| 23 DEALINGS IN THE SOFTWARE. */ | |
| 24 | |
| 25 #include "htslib/vcfutils.h" | |
| 26 | |
| 27 int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) | |
| 28 { | |
| 29 int i; | |
| 30 for (i=0; i<line->n_allele; i++) ac[i]=0; | |
| 31 | |
| 32 // Use INFO/AC,AN field only when asked | |
| 33 if ( which&BCF_UN_INFO ) | |
| 34 { | |
| 35 bcf_unpack(line, BCF_UN_INFO); | |
| 36 int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN"); | |
| 37 int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC"); | |
| 38 int i, an=-1, ac_len=0, ac_type=0; | |
| 39 uint8_t *ac_ptr=NULL; | |
| 40 if ( an_id>=0 && ac_id>=0 ) | |
| 41 { | |
| 42 for (i=0; i<line->n_info; i++) | |
| 43 { | |
| 44 bcf_info_t *z = &line->d.info[i]; | |
| 45 if ( z->key == an_id ) an = z->v1.i; | |
| 46 else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; } | |
| 47 } | |
| 48 } | |
| 49 if ( an>=0 && ac_ptr ) | |
| 50 { | |
| 51 int nac = 0; | |
| 52 #define BRANCH_INT(type_t) { \ | |
| 53 type_t *p = (type_t *) ac_ptr; \ | |
| 54 for (i=0; i<ac_len; i++) \ | |
| 55 { \ | |
| 56 ac[i+1] = p[i]; \ | |
| 57 nac += p[i]; \ | |
| 58 } \ | |
| 59 } | |
| 60 switch (ac_type) { | |
| 61 case BCF_BT_INT8: BRANCH_INT(int8_t); break; | |
| 62 case BCF_BT_INT16: BRANCH_INT(int16_t); break; | |
| 63 case BCF_BT_INT32: BRANCH_INT(int32_t); break; | |
| 64 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; | |
| 65 } | |
| 66 #undef BRANCH_INT | |
| 67 if ( an<nac ) | |
| 68 { | |
| 69 fprintf(stderr,"[E::%s] Incorrect AN/AC counts at %s:%d\n", __func__,header->id[BCF_DT_CTG][line->rid].key, line->pos+1); | |
| 70 exit(1); | |
| 71 } | |
| 72 ac[0] = an - nac; | |
| 73 return 1; | |
| 74 } | |
| 75 } | |
| 76 | |
| 77 // Split genotype fields only when asked | |
| 78 if ( which&BCF_UN_FMT ) | |
| 79 { | |
| 80 int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT"); | |
| 81 if ( gt_id<0 ) return 0; | |
| 82 bcf_unpack(line, BCF_UN_FMT); | |
| 83 bcf_fmt_t *fmt_gt = NULL; | |
| 84 for (i=0; i<(int)line->n_fmt; i++) | |
| 85 if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } | |
| 86 if ( !fmt_gt ) return 0; | |
| 87 #define BRANCH_INT(type_t,vector_end) { \ | |
| 88 for (i=0; i<line->n_sample; i++) \ | |
| 89 { \ | |
| 90 type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \ | |
| 91 int ial; \ | |
| 92 for (ial=0; ial<fmt_gt->n; ial++) \ | |
| 93 { \ | |
| 94 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ | |
| 95 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ | |
| 96 if ( p[ial]>>1 > line->n_allele ) \ | |
| 97 { \ | |
| 98 fprintf(stderr,"[E::%s] Incorrect allele (\"%d\") in %s at %s:%d\n", __func__,(p[ial]>>1)-1, header->samples[i],header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ | |
| 99 exit(1); \ | |
| 100 } \ | |
| 101 ac[(p[ial]>>1)-1]++; \ | |
| 102 } \ | |
| 103 } \ | |
| 104 } | |
| 105 switch (fmt_gt->type) { | |
| 106 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; | |
| 107 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; | |
| 108 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; | |
| 109 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; | |
| 110 } | |
| 111 #undef BRANCH_INT | |
| 112 return 1; | |
| 113 } | |
| 114 return 0; | |
| 115 } | |
| 116 | |
| 117 int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal) | |
| 118 { | |
| 119 int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0; | |
| 120 #define BRANCH_INT(type_t,vector_end) { \ | |
| 121 type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \ | |
| 122 for (i=0; i<fmt_ptr->n; i++) \ | |
| 123 { \ | |
| 124 if ( p[i] == vector_end ) break; /* smaller ploidy */ \ | |
| 125 if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \ | |
| 126 int tmp = p[i]>>1; \ | |
| 127 if ( tmp>1 ) \ | |
| 128 { \ | |
| 129 if ( !ial ) { ial = tmp; has_alt = 1; } \ | |
| 130 else if ( tmp!=ial ) \ | |
| 131 { \ | |
| 132 if ( tmp<ial ) \ | |
| 133 { \ | |
| 134 jal = ial; \ | |
| 135 ial = tmp; \ | |
| 136 } \ | |
| 137 else \ | |
| 138 { \ | |
| 139 jal = tmp; \ | |
| 140 } \ | |
| 141 has_alt = 2; \ | |
| 142 } \ | |
| 143 } \ | |
| 144 else has_ref = 1; \ | |
| 145 nals++; \ | |
| 146 } \ | |
| 147 } | |
| 148 switch (fmt_ptr->type) { | |
| 149 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; | |
| 150 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; | |
| 151 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; | |
| 152 default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; | |
| 153 } | |
| 154 #undef BRANCH_INT | |
| 155 | |
| 156 if ( _ial ) *_ial = ial>0 ? ial-1 : ial; | |
| 157 if ( _jal ) *_jal = jal>0 ? jal-1 : jal; | |
| 158 if ( !nals ) return GT_UNKN; | |
| 159 if ( nals==1 ) | |
| 160 return has_ref ? GT_HAPL_R : GT_HAPL_A; | |
| 161 if ( !has_ref ) | |
| 162 return has_alt==1 ? GT_HOM_AA : GT_HET_AA; | |
| 163 if ( !has_alt ) | |
| 164 return GT_HOM_RR; | |
| 165 return GT_HET_RA; | |
| 166 } | |
| 167 | |
| 168 int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) | |
| 169 { | |
| 170 int i; | |
| 171 bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT"); | |
| 172 if ( !gt ) return 0; | |
| 173 | |
| 174 int *ac = (int*) calloc(line->n_allele,sizeof(int)); | |
| 175 | |
| 176 // check if all alleles are populated | |
| 177 #define BRANCH(type_t,vector_end) { \ | |
| 178 for (i=0; i<line->n_sample; i++) \ | |
| 179 { \ | |
| 180 type_t *p = (type_t*) (gt->p + i*gt->size); \ | |
| 181 int ial; \ | |
| 182 for (ial=0; ial<gt->n; ial++) \ | |
| 183 { \ | |
| 184 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ | |
| 185 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ | |
| 186 if ( (p[ial]>>1)-1 >= line->n_allele ) { free(ac); return -1; } \ | |
| 187 ac[(p[ial]>>1)-1]++; \ | |
| 188 } \ | |
| 189 } \ | |
| 190 } | |
| 191 switch (gt->type) { | |
| 192 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; | |
| 193 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; | |
| 194 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; | |
| 195 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; | |
| 196 } | |
| 197 #undef BRANCH | |
| 198 | |
| 199 int rm_als = 0, nrm = 0; | |
| 200 for (i=1; i<line->n_allele; i++) | |
| 201 { | |
| 202 if ( !ac[i] ) { rm_als |= 1<<i; nrm++; } | |
| 203 } | |
| 204 free(ac); | |
| 205 | |
| 206 if ( nrm ) bcf_remove_alleles(header, line, rm_als); | |
| 207 return nrm; | |
| 208 } | |
| 209 | |
| 210 void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask) | |
| 211 { | |
| 212 int *map = (int*) calloc(line->n_allele, sizeof(int)); | |
| 213 | |
| 214 // create map of indexes from old to new ALT numbering and modify ALT | |
| 215 kstring_t str = {0,0,0}; | |
| 216 kputs(line->d.allele[0], &str); | |
| 217 | |
| 218 int nrm = 0, i,j; // i: ori alleles, j: new alleles | |
| 219 for (i=1, j=1; i<line->n_allele; i++) | |
| 220 { | |
| 221 if ( rm_mask & 1<<i ) | |
| 222 { | |
| 223 // remove this allele | |
| 224 line->d.allele[i] = NULL; | |
| 225 nrm++; | |
| 226 continue; | |
| 227 } | |
| 228 kputc(',', &str); | |
| 229 kputs(line->d.allele[i], &str); | |
| 230 map[i] = j; | |
| 231 j++; | |
| 232 } | |
| 233 if ( !nrm ) { free(map); free(str.s); return; } | |
| 234 | |
| 235 int nR_ori = line->n_allele; | |
| 236 int nR_new = line->n_allele-nrm; | |
| 237 assert(nR_new > 0); // should not be able to remove reference allele | |
| 238 int nA_ori = nR_ori-1; | |
| 239 int nA_new = nR_new-1; | |
| 240 | |
| 241 int nG_ori = nR_ori*(nR_ori + 1)/2; | |
| 242 int nG_new = nR_new*(nR_new + 1)/2; | |
| 243 | |
| 244 bcf_update_alleles_str(header, line, str.s); | |
| 245 | |
| 246 // remove from Number=G, Number=R and Number=A INFO fields. | |
| 247 uint8_t *dat = NULL; | |
| 248 int mdat = 0, ndat = 0, mdat_bytes = 0, nret; | |
| 249 for (i=0; i<line->n_info; i++) | |
| 250 { | |
| 251 bcf_info_t *info = &line->d.info[i]; | |
| 252 int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key); | |
| 253 | |
| 254 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change | |
| 255 | |
| 256 int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key); | |
| 257 if ( type==BCF_HT_FLAG ) continue; | |
| 258 int size = 1; | |
| 259 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; | |
| 260 | |
| 261 mdat = mdat_bytes / size; | |
| 262 nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type); | |
| 263 mdat_bytes = mdat * size; | |
| 264 if ( nret<0 ) | |
| 265 { | |
| 266 fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
| 267 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); | |
| 268 exit(1); | |
| 269 } | |
| 270 if ( type==BCF_HT_STR ) | |
| 271 { | |
| 272 str.l = 0; | |
| 273 char *ss = (char*) dat, *se = (char*) dat; | |
| 274 if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) | |
| 275 { | |
| 276 int nexp, inc = 0; | |
| 277 if ( vlen==BCF_VL_A ) | |
| 278 { | |
| 279 nexp = nA_ori; | |
| 280 inc = 1; | |
| 281 } | |
| 282 else | |
| 283 nexp = nR_ori; | |
| 284 for (j=0; j<nexp; j++) | |
| 285 { | |
| 286 if ( !*se ) break; | |
| 287 while ( *se && *se!=',' ) se++; | |
| 288 if ( rm_mask & 1<<(j+inc) ) | |
| 289 { | |
| 290 if ( *se ) se++; | |
| 291 ss = se; | |
| 292 continue; | |
| 293 } | |
| 294 if ( str.l ) kputc(',',&str); | |
| 295 kputsn(ss,se-ss,&str); | |
| 296 if ( *se ) se++; | |
| 297 ss = se; | |
| 298 } | |
| 299 assert( j==nexp ); | |
| 300 } | |
| 301 else // Number=G, assuming diploid genotype | |
| 302 { | |
| 303 int k = 0, n = 0; | |
| 304 for (j=0; j<nR_ori; j++) | |
| 305 { | |
| 306 for (k=0; k<=j; k++) | |
| 307 { | |
| 308 if ( !*se ) break; | |
| 309 while ( *se && *se!=',' ) se++; | |
| 310 n++; | |
| 311 if ( rm_mask & 1<<j || rm_mask & 1<<k ) | |
| 312 { | |
| 313 if ( *se ) se++; | |
| 314 ss = se; | |
| 315 continue; | |
| 316 } | |
| 317 if ( str.l ) kputc(',',&str); | |
| 318 kputsn(ss,se-ss,&str); | |
| 319 if ( *se ) se++; | |
| 320 ss = se; | |
| 321 } | |
| 322 if ( !*se ) break; | |
| 323 } | |
| 324 assert( n=nG_ori ); | |
| 325 } | |
| 326 | |
| 327 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type); | |
| 328 if ( nret<0 ) | |
| 329 { | |
| 330 fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
| 331 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); | |
| 332 exit(1); | |
| 333 } | |
| 334 continue; | |
| 335 } | |
| 336 | |
| 337 if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) | |
| 338 { | |
| 339 int inc = 0, ntop; | |
| 340 if ( vlen==BCF_VL_A ) | |
| 341 { | |
| 342 assert( nret==nA_ori ); | |
| 343 ntop = nA_ori; | |
| 344 ndat = nA_new; | |
| 345 inc = 1; | |
| 346 } | |
| 347 else | |
| 348 { | |
| 349 assert( nret==nR_ori ); | |
| 350 ntop = nR_ori; | |
| 351 ndat = nR_new; | |
| 352 } | |
| 353 int k = 0; | |
| 354 | |
| 355 #define BRANCH(type_t,is_vector_end) \ | |
| 356 { \ | |
| 357 type_t *ptr = (type_t*) dat; \ | |
| 358 int size = sizeof(type_t); \ | |
| 359 for (j=0; j<ntop; j++) /* j:ori, k:new */ \ | |
| 360 { \ | |
| 361 if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \ | |
| 362 if ( rm_mask & 1<<(j+inc) ) continue; \ | |
| 363 if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \ | |
| 364 k++; \ | |
| 365 } \ | |
| 366 } | |
| 367 switch (type) | |
| 368 { | |
| 369 case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break; | |
| 370 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break; | |
| 371 } | |
| 372 #undef BRANCH | |
| 373 } | |
| 374 else // Number=G | |
| 375 { | |
| 376 assert( nret==nG_ori ); | |
| 377 int k, l_ori = -1, l_new = 0; | |
| 378 ndat = nG_new; | |
| 379 | |
| 380 #define BRANCH(type_t,is_vector_end) \ | |
| 381 { \ | |
| 382 type_t *ptr = (type_t*) dat; \ | |
| 383 int size = sizeof(type_t); \ | |
| 384 for (j=0; j<nR_ori; j++) \ | |
| 385 { \ | |
| 386 for (k=0; k<=j; k++) \ | |
| 387 { \ | |
| 388 l_ori++; \ | |
| 389 if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \ | |
| 390 if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \ | |
| 391 if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \ | |
| 392 l_new++; \ | |
| 393 } \ | |
| 394 } \ | |
| 395 } | |
| 396 switch (type) | |
| 397 { | |
| 398 case BCF_HT_INT: BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break; | |
| 399 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break; | |
| 400 } | |
| 401 #undef BRANCH | |
| 402 } | |
| 403 | |
| 404 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type); | |
| 405 if ( nret<0 ) | |
| 406 { | |
| 407 fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
| 408 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); | |
| 409 exit(1); | |
| 410 } | |
| 411 } | |
| 412 | |
| 413 // Update GT fields, the allele indexes might have changed | |
| 414 for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break; | |
| 415 if ( i<line->n_allele ) | |
| 416 { | |
| 417 mdat = mdat_bytes / 4; // sizeof(int32_t) | |
| 418 nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat); | |
| 419 mdat_bytes = mdat * 4; | |
| 420 if ( nret>0 ) | |
| 421 { | |
| 422 nret /= line->n_sample; | |
| 423 int32_t *ptr = (int32_t*) dat; | |
| 424 for (i=0; i<line->n_sample; i++) | |
| 425 { | |
| 426 for (j=0; j<nret; j++) | |
| 427 { | |
| 428 if ( bcf_gt_is_missing(ptr[j]) ) continue; | |
| 429 if ( ptr[j]==bcf_int32_vector_end ) break; | |
| 430 int al = bcf_gt_allele(ptr[j]); | |
| 431 assert( al<nR_ori && map[al]>=0 ); | |
| 432 ptr[j] = (map[al]+1)<<1 | (ptr[j]&1); | |
| 433 } | |
| 434 ptr += nret; | |
| 435 } | |
| 436 bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample); | |
| 437 } | |
| 438 } | |
| 439 | |
| 440 // Remove from Number=G, Number=R and Number=A FORMAT fields. | |
| 441 // Assuming haploid or diploid GTs | |
| 442 for (i=0; i<line->n_fmt; i++) | |
| 443 { | |
| 444 bcf_fmt_t *fmt = &line->d.fmt[i]; | |
| 445 int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id); | |
| 446 | |
| 447 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change | |
| 448 | |
| 449 int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id); | |
| 450 if ( type==BCF_HT_FLAG ) continue; | |
| 451 | |
| 452 int size = 1; | |
| 453 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; | |
| 454 | |
| 455 mdat = mdat_bytes / size; | |
| 456 nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type); | |
| 457 mdat_bytes = mdat * size; | |
| 458 if ( nret<0 ) | |
| 459 { | |
| 460 fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
| 461 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); | |
| 462 exit(1); | |
| 463 } | |
| 464 | |
| 465 if ( type==BCF_HT_STR ) | |
| 466 { | |
| 467 int size = nret/line->n_sample; // number of bytes per sample | |
| 468 str.l = 0; | |
| 469 if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) | |
| 470 { | |
| 471 int nexp, inc = 0; | |
| 472 if ( vlen==BCF_VL_A ) | |
| 473 { | |
| 474 nexp = nA_ori; | |
| 475 inc = 1; | |
| 476 } | |
| 477 else | |
| 478 nexp = nR_ori; | |
| 479 for (j=0; j<line->n_sample; j++) | |
| 480 { | |
| 481 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; | |
| 482 int k_src = 0, k_dst = 0, l = str.l; | |
| 483 for (k_src=0; k_src<nexp; k_src++) | |
| 484 { | |
| 485 if ( ptr>=se || !*ptr) break; | |
| 486 while ( ptr<se && *ptr && *ptr!=',' ) ptr++; | |
| 487 if ( rm_mask & 1<<(k_src+inc) ) | |
| 488 { | |
| 489 ss = ++ptr; | |
| 490 continue; | |
| 491 } | |
| 492 if ( k_dst ) kputc(',',&str); | |
| 493 kputsn(ss,ptr-ss,&str); | |
| 494 ss = ++ptr; | |
| 495 k_dst++; | |
| 496 } | |
| 497 assert( k_src==nexp ); | |
| 498 l = str.l - l; | |
| 499 for (; l<size; l++) kputc(0, &str); | |
| 500 } | |
| 501 } | |
| 502 else // Number=G, diploid or haploid | |
| 503 { | |
| 504 for (j=0; j<line->n_sample; j++) | |
| 505 { | |
| 506 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; | |
| 507 int k_src = 0, k_dst = 0, l = str.l; | |
| 508 int nexp = 0; // diploid or haploid? | |
| 509 while ( ptr<se ) | |
| 510 { | |
| 511 if ( !*ptr ) break; | |
| 512 if ( *ptr==',' ) nexp++; | |
| 513 ptr++; | |
| 514 } | |
| 515 if ( ptr!=ss ) nexp++; | |
| 516 assert( nexp==nG_ori || nexp==nR_ori ); | |
| 517 ptr = ss; | |
| 518 if ( nexp==nG_ori ) // diploid | |
| 519 { | |
| 520 int ia, ib; | |
| 521 for (ia=0; ia<nR_ori; ia++) | |
| 522 { | |
| 523 for (ib=0; ib<=ia; ib++) | |
| 524 { | |
| 525 if ( ptr>=se || !*ptr ) break; | |
| 526 while ( ptr<se && *ptr && *ptr!=',' ) ptr++; | |
| 527 if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) | |
| 528 { | |
| 529 ss = ++ptr; | |
| 530 continue; | |
| 531 } | |
| 532 if ( k_dst ) kputc(',',&str); | |
| 533 kputsn(ss,ptr-ss,&str); | |
| 534 ss = ++ptr; | |
| 535 k_dst++; | |
| 536 } | |
| 537 if ( ptr>=se || !*ptr ) break; | |
| 538 } | |
| 539 } | |
| 540 else // haploid | |
| 541 { | |
| 542 for (k_src=0; k_src<nR_ori; k_src++) | |
| 543 { | |
| 544 if ( ptr>=se || !*ptr ) break; | |
| 545 while ( ptr<se && *ptr && *ptr!=',' ) ptr++; | |
| 546 if ( rm_mask & 1<<k_src ) | |
| 547 { | |
| 548 ss = ++ptr; | |
| 549 continue; | |
| 550 } | |
| 551 if ( k_dst ) kputc(',',&str); | |
| 552 kputsn(ss,ptr-ss,&str); | |
| 553 ss = ++ptr; | |
| 554 k_dst++; | |
| 555 } | |
| 556 assert( k_src==nR_ori ); | |
| 557 l = str.l - l; | |
| 558 for (; l<size; l++) kputc(0, &str); | |
| 559 } | |
| 560 } | |
| 561 } | |
| 562 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type); | |
| 563 if ( nret<0 ) | |
| 564 { | |
| 565 fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
| 566 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); | |
| 567 exit(1); | |
| 568 } | |
| 569 continue; | |
| 570 } | |
| 571 | |
| 572 int nori = nret / line->n_sample; | |
| 573 if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G | |
| 574 { | |
| 575 int inc = 0, nnew; | |
| 576 if ( vlen==BCF_VL_A ) | |
| 577 { | |
| 578 assert( nori==nA_ori ); // todo: will fail if all values are missing | |
| 579 ndat = nA_new*line->n_sample; | |
| 580 nnew = nA_new; | |
| 581 inc = 1; | |
| 582 } | |
| 583 else | |
| 584 { | |
| 585 assert( nori==nR_ori ); // todo: will fail if all values are missing | |
| 586 ndat = nR_new*line->n_sample; | |
| 587 nnew = nR_new; | |
| 588 } | |
| 589 | |
| 590 #define BRANCH(type_t,is_vector_end) \ | |
| 591 { \ | |
| 592 for (j=0; j<line->n_sample; j++) \ | |
| 593 { \ | |
| 594 type_t *ptr_src = ((type_t*)dat) + j*nori; \ | |
| 595 type_t *ptr_dst = ((type_t*)dat) + j*nnew; \ | |
| 596 int size = sizeof(type_t); \ | |
| 597 int k_src, k_dst = 0; \ | |
| 598 for (k_src=0; k_src<nori; k_src++) \ | |
| 599 { \ | |
| 600 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \ | |
| 601 if ( rm_mask & 1<<(k_src+inc) ) continue; \ | |
| 602 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ | |
| 603 k_dst++; \ | |
| 604 } \ | |
| 605 } \ | |
| 606 } | |
| 607 switch (type) | |
| 608 { | |
| 609 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break; | |
| 610 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break; | |
| 611 } | |
| 612 #undef BRANCH | |
| 613 } | |
| 614 else // Number=G, diploid or mixture of haploid+diploid | |
| 615 { | |
| 616 assert( nori==nG_ori ); | |
| 617 ndat = nG_new*line->n_sample; | |
| 618 | |
| 619 #define BRANCH(type_t,is_vector_end) \ | |
| 620 { \ | |
| 621 for (j=0; j<line->n_sample; j++) \ | |
| 622 { \ | |
| 623 type_t *ptr_src = ((type_t*)dat) + j*nori; \ | |
| 624 type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \ | |
| 625 int size = sizeof(type_t); \ | |
| 626 int ia, ib, k_dst = 0, k_src; \ | |
| 627 int nset = 0; /* haploid or diploid? */ \ | |
| 628 for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \ | |
| 629 if ( nset==nR_ori ) /* haploid */ \ | |
| 630 { \ | |
| 631 for (k_src=0; k_src<nR_ori; k_src++) \ | |
| 632 { \ | |
| 633 if ( rm_mask & 1<<k_src ) continue; \ | |
| 634 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ | |
| 635 k_dst++; \ | |
| 636 } \ | |
| 637 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ | |
| 638 } \ | |
| 639 else /* diploid */ \ | |
| 640 { \ | |
| 641 k_src = -1; \ | |
| 642 for (ia=0; ia<nR_ori; ia++) \ | |
| 643 { \ | |
| 644 for (ib=0; ib<=ia; ib++) \ | |
| 645 { \ | |
| 646 k_src++; \ | |
| 647 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \ | |
| 648 if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \ | |
| 649 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ | |
| 650 k_dst++; \ | |
| 651 } \ | |
| 652 } \ | |
| 653 } \ | |
| 654 } \ | |
| 655 } | |
| 656 switch (type) | |
| 657 { | |
| 658 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break; | |
| 659 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break; | |
| 660 } | |
| 661 #undef BRANCH | |
| 662 } | |
| 663 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type); | |
| 664 if ( nret<0 ) | |
| 665 { | |
| 666 fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
| 667 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); | |
| 668 exit(1); | |
| 669 } | |
| 670 } | |
| 671 free(dat); | |
| 672 free(str.s); | |
| 673 free(map); | |
| 674 } | |
| 675 |
