Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/sam_header.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* | |
2 Copyright (c) 2013 Genome Research Ltd. | |
3 Author: James Bonfield <jkb@sanger.ac.uk> | |
4 | |
5 Redistribution and use in source and binary forms, with or without | |
6 modification, are permitted provided that the following conditions are met: | |
7 | |
8 1. Redistributions of source code must retain the above copyright notice, | |
9 this list of conditions and the following disclaimer. | |
10 | |
11 2. Redistributions in binary form must reproduce the above copyright notice, | |
12 this list of conditions and the following disclaimer in the documentation | |
13 and/or other materials provided with the distribution. | |
14 | |
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
16 Institute nor the names of its contributors may be used to endorse or promote | |
17 products derived from this software without specific prior written permission. | |
18 | |
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 #ifdef HAVE_CONFIG_H | |
32 #include "io_lib_config.h" | |
33 #endif | |
34 | |
35 #include <string.h> | |
36 #include <assert.h> | |
37 | |
38 #include "cram/sam_header.h" | |
39 #include "cram/string_alloc.h" | |
40 | |
41 static void sam_hdr_error(char *msg, char *line, int len, int lno) { | |
42 int j; | |
43 | |
44 for (j = 0; j < len && line[j] != '\n'; j++) | |
45 ; | |
46 fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, j, line); | |
47 } | |
48 | |
49 void sam_hdr_dump(SAM_hdr *hdr) { | |
50 khint_t k; | |
51 int i; | |
52 | |
53 printf("===DUMP===\n"); | |
54 for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) { | |
55 SAM_hdr_type *t1, *t2; | |
56 char c[2]; | |
57 | |
58 if (!kh_exist(hdr->h, k)) | |
59 continue; | |
60 | |
61 t1 = t2 = kh_val(hdr->h, k); | |
62 c[0] = kh_key(hdr->h, k)>>8; | |
63 c[1] = kh_key(hdr->h, k)&0xff; | |
64 printf("Type %.2s, count %d\n", c, t1->prev->order+1); | |
65 | |
66 do { | |
67 SAM_hdr_tag *tag; | |
68 printf(">>>%d ", t1->order); | |
69 for (tag = t1->tag; tag; tag=tag->next) { | |
70 printf("\"%.2s\":\"%.*s\"\t", | |
71 tag->str, tag->len-3, tag->str+3); | |
72 } | |
73 putchar('\n'); | |
74 t1 = t1->next; | |
75 } while (t1 != t2); | |
76 } | |
77 | |
78 /* Dump out PG chains */ | |
79 printf("\n@PG chains:\n"); | |
80 for (i = 0; i < hdr->npg_end; i++) { | |
81 int j; | |
82 printf(" %d:", i); | |
83 for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) { | |
84 printf("%s%d(%.*s)", | |
85 j == hdr->pg_end[i] ? " " : "->", | |
86 j, hdr->pg[j].name_len, hdr->pg[j].name); | |
87 } | |
88 printf("\n"); | |
89 } | |
90 | |
91 puts("===END DUMP==="); | |
92 } | |
93 | |
94 /* Updates the hash tables in the SAM_hdr structure. | |
95 * | |
96 * Returns 0 on success; | |
97 * -1 on failure | |
98 */ | |
99 static int sam_hdr_update_hashes(SAM_hdr *sh, | |
100 int type, | |
101 SAM_hdr_type *h_type) { | |
102 /* Add to reference hash? */ | |
103 if ((type>>8) == 'S' && (type&0xff) == 'Q') { | |
104 SAM_hdr_tag *tag; | |
105 int nref = sh->nref; | |
106 | |
107 sh->ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref)); | |
108 if (!sh->ref) | |
109 return -1; | |
110 | |
111 tag = h_type->tag; | |
112 sh->ref[nref].name = NULL; | |
113 sh->ref[nref].len = 0; | |
114 sh->ref[nref].ty = h_type; | |
115 sh->ref[nref].tag = tag; | |
116 | |
117 while (tag) { | |
118 if (tag->str[0] == 'S' && tag->str[1] == 'N') { | |
119 if (!(sh->ref[nref].name = malloc(tag->len))) | |
120 return -1; | |
121 strncpy(sh->ref[nref].name, tag->str+3, tag->len-3); | |
122 sh->ref[nref].name[tag->len-3] = 0; | |
123 } else if (tag->str[0] == 'L' && tag->str[1] == 'N') { | |
124 sh->ref[nref].len = atoi(tag->str+3); | |
125 } | |
126 tag = tag->next; | |
127 } | |
128 | |
129 if (sh->ref[nref].name) { | |
130 khint_t k; | |
131 int r; | |
132 k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r); | |
133 if (-1 == r) return -1; | |
134 kh_val(sh->ref_hash, k) = nref; | |
135 } | |
136 | |
137 sh->nref++; | |
138 } | |
139 | |
140 /* Add to read-group hash? */ | |
141 if ((type>>8) == 'R' && (type&0xff) == 'G') { | |
142 SAM_hdr_tag *tag; | |
143 int nrg = sh->nrg; | |
144 | |
145 sh->rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg)); | |
146 if (!sh->rg) | |
147 return -1; | |
148 | |
149 tag = h_type->tag; | |
150 sh->rg[nrg].name = NULL; | |
151 sh->rg[nrg].name_len = 0; | |
152 sh->rg[nrg].ty = h_type; | |
153 sh->rg[nrg].tag = tag; | |
154 sh->rg[nrg].id = nrg; | |
155 | |
156 while (tag) { | |
157 if (tag->str[0] == 'I' && tag->str[1] == 'D') { | |
158 if (!(sh->rg[nrg].name = malloc(tag->len))) | |
159 return -1; | |
160 strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3); | |
161 sh->rg[nrg].name[tag->len-3] = 0; | |
162 sh->rg[nrg].name_len = strlen(sh->rg[nrg].name); | |
163 } | |
164 tag = tag->next; | |
165 } | |
166 | |
167 if (sh->rg[nrg].name) { | |
168 khint_t k; | |
169 int r; | |
170 k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r); | |
171 if (-1 == r) return -1; | |
172 kh_val(sh->rg_hash, k) = nrg; | |
173 } | |
174 | |
175 sh->nrg++; | |
176 } | |
177 | |
178 /* Add to program hash? */ | |
179 if ((type>>8) == 'P' && (type&0xff) == 'G') { | |
180 SAM_hdr_tag *tag; | |
181 int npg = sh->npg; | |
182 | |
183 sh->pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg)); | |
184 if (!sh->pg) | |
185 return -1; | |
186 | |
187 tag = h_type->tag; | |
188 sh->pg[npg].name = NULL; | |
189 sh->pg[npg].name_len = 0; | |
190 sh->pg[npg].ty = h_type; | |
191 sh->pg[npg].tag = tag; | |
192 sh->pg[npg].id = npg; | |
193 sh->pg[npg].prev_id = -1; | |
194 | |
195 while (tag) { | |
196 if (tag->str[0] == 'I' && tag->str[1] == 'D') { | |
197 if (!(sh->pg[npg].name = malloc(tag->len))) | |
198 return -1; | |
199 strncpy(sh->pg[npg].name, tag->str+3, tag->len-3); | |
200 sh->pg[npg].name[tag->len-3] = 0; | |
201 sh->pg[npg].name_len = strlen(sh->pg[npg].name); | |
202 } else if (tag->str[0] == 'P' && tag->str[1] == 'P') { | |
203 // Resolve later if needed | |
204 khint_t k; | |
205 char tmp = tag->str[tag->len]; tag->str[tag->len] = 0; | |
206 k = kh_get(m_s2i, sh->pg_hash, tag->str+3); | |
207 tag->str[tag->len] = tmp; | |
208 | |
209 if (k != kh_end(sh->pg_hash)) { | |
210 int p_id = kh_val(sh->pg_hash, k); | |
211 sh->pg[npg].prev_id = sh->pg[p_id].id; | |
212 | |
213 /* Unmark previous entry as a PG termination */ | |
214 if (sh->npg_end > 0 && | |
215 sh->pg_end[sh->npg_end-1] == p_id) { | |
216 sh->npg_end--; | |
217 } else { | |
218 int i; | |
219 for (i = 0; i < sh->npg_end; i++) { | |
220 if (sh->pg_end[i] == p_id) { | |
221 memmove(&sh->pg_end[i], &sh->pg_end[i+1], | |
222 (sh->npg_end-i-1)*sizeof(*sh->pg_end)); | |
223 sh->npg_end--; | |
224 } | |
225 } | |
226 } | |
227 } else { | |
228 sh->pg[npg].prev_id = -1; | |
229 } | |
230 } | |
231 tag = tag->next; | |
232 } | |
233 | |
234 if (sh->pg[npg].name) { | |
235 khint_t k; | |
236 int r; | |
237 k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r); | |
238 if (-1 == r) return -1; | |
239 kh_val(sh->pg_hash, k) = npg; | |
240 } | |
241 | |
242 /* Add to npg_end[] array. Remove later if we find a PP line */ | |
243 if (sh->npg_end >= sh->npg_end_alloc) { | |
244 sh->npg_end_alloc = sh->npg_end_alloc | |
245 ? sh->npg_end_alloc*2 | |
246 : 4; | |
247 sh->pg_end = realloc(sh->pg_end, | |
248 sh->npg_end_alloc * sizeof(int)); | |
249 if (!sh->pg_end) | |
250 return -1; | |
251 } | |
252 sh->pg_end[sh->npg_end++] = npg; | |
253 | |
254 sh->npg++; | |
255 } | |
256 | |
257 return 0; | |
258 } | |
259 | |
260 /* | |
261 * Appends a formatted line to an existing SAM header. | |
262 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with | |
263 * optional new-line. If it contains more than 1 line then multiple lines | |
264 * will be added in order. | |
265 * | |
266 * Len is the length of the text data, or 0 if unknown (in which case | |
267 * it should be null terminated). | |
268 * | |
269 * Returns 0 on success | |
270 * -1 on failure | |
271 */ | |
272 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) { | |
273 int i, lno = 1, text_offset; | |
274 char *hdr; | |
275 | |
276 if (!len) | |
277 len = strlen(lines); | |
278 | |
279 text_offset = ks_len(&sh->text); | |
280 if (EOF == kputsn(lines, len, &sh->text)) | |
281 return -1; | |
282 hdr = ks_str(&sh->text) + text_offset; | |
283 | |
284 for (i = 0; i < len; i++) { | |
285 khint32_t type; | |
286 khint_t k; | |
287 | |
288 int l_start = i, new; | |
289 SAM_hdr_type *h_type; | |
290 SAM_hdr_tag *h_tag, *last; | |
291 | |
292 if (hdr[i] != '@') { | |
293 int j; | |
294 for (j = i; j < len && hdr[j] != '\n'; j++) | |
295 ; | |
296 sam_hdr_error("Header line does not start with '@'", | |
297 &hdr[l_start], len - l_start, lno); | |
298 return -1; | |
299 } | |
300 | |
301 type = (hdr[i+1]<<8) | hdr[i+2]; | |
302 if (hdr[i+1] < 'A' || hdr[i+1] > 'z' || | |
303 hdr[i+2] < 'A' || hdr[i+2] > 'z') { | |
304 sam_hdr_error("Header line does not have a two character key", | |
305 &hdr[l_start], len - l_start, lno); | |
306 return -1; | |
307 } | |
308 | |
309 i += 3; | |
310 if (hdr[i] == '\n') | |
311 continue; | |
312 | |
313 // Add the header line type | |
314 if (!(h_type = pool_alloc(sh->type_pool))) | |
315 return -1; | |
316 if (-1 == (k = kh_put(sam_hdr, sh->h, type, &new))) | |
317 return -1; | |
318 | |
319 // Form the ring, either with self or other lines of this type | |
320 if (!new) { | |
321 SAM_hdr_type *t = kh_val(sh->h, k), *p; | |
322 p = t->prev; | |
323 | |
324 assert(p->next = t); | |
325 p->next = h_type; | |
326 h_type->prev = p; | |
327 | |
328 t->prev = h_type; | |
329 h_type->next = t; | |
330 h_type->order = p->order+1; | |
331 } else { | |
332 kh_val(sh->h, k) = h_type; | |
333 h_type->prev = h_type->next = h_type; | |
334 h_type->order = 0; | |
335 } | |
336 | |
337 // Parse the tags on this line | |
338 last = NULL; | |
339 if ((type>>8) == 'C' && (type&0xff) == 'O') { | |
340 int j; | |
341 if (hdr[i] != '\t') { | |
342 sam_hdr_error("Missing tab", | |
343 &hdr[l_start], len - l_start, lno); | |
344 return -1; | |
345 } | |
346 | |
347 for (j = ++i; j < len && hdr[j] != '\n'; j++) | |
348 ; | |
349 | |
350 if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool))) | |
351 return -1; | |
352 h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i); | |
353 h_tag->len = j-i; | |
354 h_tag->next = NULL; | |
355 if (!h_tag->str) | |
356 return -1; | |
357 | |
358 i = j; | |
359 | |
360 } else { | |
361 do { | |
362 int j; | |
363 if (hdr[i] != '\t') { | |
364 sam_hdr_error("Missing tab", | |
365 &hdr[l_start], len - l_start, lno); | |
366 return -1; | |
367 } | |
368 | |
369 for (j = ++i; j < len && hdr[j] != '\n' && hdr[j] != '\t'; j++) | |
370 ; | |
371 | |
372 if (!(h_tag = pool_alloc(sh->tag_pool))) | |
373 return -1; | |
374 h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i); | |
375 h_tag->len = j-i; | |
376 h_tag->next = NULL; | |
377 if (!h_tag->str) | |
378 return -1; | |
379 | |
380 if (h_tag->len < 3 || h_tag->str[2] != ':') { | |
381 sam_hdr_error("Malformed key:value pair", | |
382 &hdr[l_start], len - l_start, lno); | |
383 return -1; | |
384 } | |
385 | |
386 if (last) | |
387 last->next = h_tag; | |
388 else | |
389 h_type->tag = h_tag; | |
390 | |
391 last = h_tag; | |
392 i = j; | |
393 } while (i < len && hdr[i] != '\n'); | |
394 } | |
395 | |
396 /* Update RG/SQ hashes */ | |
397 if (-1 == sam_hdr_update_hashes(sh, type, h_type)) | |
398 return -1; | |
399 } | |
400 | |
401 return 0; | |
402 } | |
403 | |
404 /* | |
405 * Adds a single line to a SAM header. | |
406 * Specify type and one or more key,value pairs, ending with the NULL key. | |
407 * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL). | |
408 * | |
409 * Returns index for specific entry on success (eg 2nd SQ, 4th RG) | |
410 * -1 on failure | |
411 */ | |
412 int sam_hdr_add(SAM_hdr *sh, const char *type, ...) { | |
413 va_list args; | |
414 va_start(args, type); | |
415 return sam_hdr_vadd(sh, type, args, NULL); | |
416 } | |
417 | |
418 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) { | |
419 va_list args; | |
420 SAM_hdr_type *h_type; | |
421 SAM_hdr_tag *h_tag, *last; | |
422 int new; | |
423 khint32_t type_i = (type[0]<<8) | type[1], k; | |
424 | |
425 #if defined(HAVE_VA_COPY) | |
426 va_list ap_local; | |
427 #endif | |
428 | |
429 if (EOF == kputc_('@', &sh->text)) | |
430 return -1; | |
431 if (EOF == kputsn(type, 2, &sh->text)) | |
432 return -1; | |
433 | |
434 if (!(h_type = pool_alloc(sh->type_pool))) | |
435 return -1; | |
436 if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new))) | |
437 return -1; | |
438 kh_val(sh->h, k) = h_type; | |
439 | |
440 // Form the ring, either with self or other lines of this type | |
441 if (!new) { | |
442 SAM_hdr_type *t = kh_val(sh->h, k), *p; | |
443 p = t->prev; | |
444 | |
445 assert(p->next = t); | |
446 p->next = h_type; | |
447 h_type->prev = p; | |
448 | |
449 t->prev = h_type; | |
450 h_type->next = t; | |
451 h_type->order = p->order + 1; | |
452 } else { | |
453 h_type->prev = h_type->next = h_type; | |
454 h_type->order = 0; | |
455 } | |
456 | |
457 last = NULL; | |
458 | |
459 // Any ... varargs | |
460 va_start(args, ap); | |
461 for (;;) { | |
462 char *k, *v; | |
463 int idx; | |
464 | |
465 if (!(k = (char *)va_arg(args, char *))) | |
466 break; | |
467 v = va_arg(args, char *); | |
468 | |
469 if (EOF == kputc_('\t', &sh->text)) | |
470 return -1; | |
471 | |
472 if (!(h_tag = pool_alloc(sh->tag_pool))) | |
473 return -1; | |
474 idx = ks_len(&sh->text); | |
475 | |
476 if (EOF == kputs(k, &sh->text)) | |
477 return -1; | |
478 if (EOF == kputc_(':', &sh->text)) | |
479 return -1; | |
480 if (EOF == kputs(v, &sh->text)) | |
481 return -1; | |
482 | |
483 h_tag->len = ks_len(&sh->text) - idx; | |
484 h_tag->str = string_ndup(sh->str_pool, | |
485 ks_str(&sh->text) + idx, | |
486 h_tag->len); | |
487 h_tag->next = NULL; | |
488 if (!h_tag->str) | |
489 return -1; | |
490 | |
491 if (last) | |
492 last->next = h_tag; | |
493 else | |
494 h_type->tag = h_tag; | |
495 | |
496 last = h_tag; | |
497 } | |
498 va_end(args); | |
499 | |
500 #if defined(HAVE_VA_COPY) | |
501 va_copy(ap_local, ap); | |
502 # define ap ap_local | |
503 #endif | |
504 | |
505 // Plus the specified va_list params | |
506 for (;;) { | |
507 char *k, *v; | |
508 int idx; | |
509 | |
510 if (!(k = (char *)va_arg(ap, char *))) | |
511 break; | |
512 v = va_arg(ap, char *); | |
513 | |
514 if (EOF == kputc_('\t', &sh->text)) | |
515 return -1; | |
516 | |
517 if (!(h_tag = pool_alloc(sh->tag_pool))) | |
518 return -1; | |
519 idx = ks_len(&sh->text); | |
520 | |
521 if (EOF == kputs(k, &sh->text)) | |
522 return -1; | |
523 if (EOF == kputc_(':', &sh->text)) | |
524 return -1; | |
525 if (EOF == kputs(v, &sh->text)) | |
526 return -1; | |
527 | |
528 h_tag->len = ks_len(&sh->text) - idx; | |
529 h_tag->str = string_ndup(sh->str_pool, | |
530 ks_str(&sh->text) + idx, | |
531 h_tag->len); | |
532 h_tag->next = NULL; | |
533 if (!h_tag->str) | |
534 return -1; | |
535 | |
536 if (last) | |
537 last->next = h_tag; | |
538 else | |
539 h_type->tag = h_tag; | |
540 | |
541 last = h_tag; | |
542 } | |
543 va_end(ap); | |
544 | |
545 if (EOF == kputc('\n', &sh->text)) | |
546 return -1; | |
547 | |
548 int itype = (type[0]<<8) | type[1]; | |
549 if (-1 == sam_hdr_update_hashes(sh, itype, h_type)) | |
550 return -1; | |
551 | |
552 return h_type->order; | |
553 } | |
554 | |
555 /* | |
556 * Returns the first header item matching 'type'. If ID is non-NULL it checks | |
557 * for the tag ID: and compares against the specified ID. | |
558 * | |
559 * Returns NULL if no type/ID is found | |
560 */ | |
561 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type, | |
562 char *ID_key, char *ID_value) { | |
563 SAM_hdr_type *t1, *t2; | |
564 int itype = (type[0]<<8)|(type[1]); | |
565 khint_t k; | |
566 | |
567 /* Special case for types we have prebuilt hashes on */ | |
568 if (ID_key) { | |
569 if (type[0] == 'S' && type[1] == 'Q' && | |
570 ID_key[0] == 'S' && ID_key[1] == 'N') { | |
571 k = kh_get(m_s2i, hdr->ref_hash, ID_value); | |
572 return k != kh_end(hdr->ref_hash) | |
573 ? hdr->ref[kh_val(hdr->ref_hash, k)].ty | |
574 : NULL; | |
575 } | |
576 | |
577 if (type[0] == 'R' && type[1] == 'G' && | |
578 ID_key[0] == 'I' && ID_key[1] == 'D') { | |
579 k = kh_get(m_s2i, hdr->rg_hash, ID_value); | |
580 return k != kh_end(hdr->rg_hash) | |
581 ? hdr->rg[kh_val(hdr->rg_hash, k)].ty | |
582 : NULL; | |
583 } | |
584 | |
585 if (type[0] == 'P' && type[1] == 'G' && | |
586 ID_key[0] == 'I' && ID_key[1] == 'D') { | |
587 k = kh_get(m_s2i, hdr->pg_hash, ID_value); | |
588 return k != kh_end(hdr->pg_hash) | |
589 ? hdr->pg[kh_val(hdr->pg_hash, k)].ty | |
590 : NULL; | |
591 } | |
592 } | |
593 | |
594 k = kh_get(sam_hdr, hdr->h, itype); | |
595 if (k == kh_end(hdr->h)) | |
596 return NULL; | |
597 | |
598 if (!ID_key) | |
599 return kh_val(hdr->h, k); | |
600 | |
601 t1 = t2 = kh_val(hdr->h, k); | |
602 do { | |
603 SAM_hdr_tag *tag; | |
604 for (tag = t1->tag; tag; tag = tag->next) { | |
605 if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) { | |
606 char *cp1 = tag->str+3; | |
607 char *cp2 = ID_value; | |
608 while (*cp1 && *cp1 == *cp2) | |
609 cp1++, cp2++; | |
610 if (*cp2 || *cp1) | |
611 continue; | |
612 return t1; | |
613 } | |
614 } | |
615 t1 = t1->next; | |
616 } while (t1 != t2); | |
617 | |
618 return NULL; | |
619 } | |
620 | |
621 /* | |
622 * As per SAM_hdr_type, but returns a complete line of formatted text | |
623 * for a specific head type/ID combination. If ID is NULL then it returns | |
624 * the first line of the specified type. | |
625 * | |
626 * The returned string is malloced and should be freed by the calling | |
627 * function with free(). | |
628 * | |
629 * Returns NULL if no type/ID is found. | |
630 */ | |
631 char *sam_hdr_find_line(SAM_hdr *hdr, char *type, | |
632 char *ID_key, char *ID_value) { | |
633 SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value); | |
634 kstring_t ks = KS_INITIALIZER; | |
635 SAM_hdr_tag *tag; | |
636 int r = 0; | |
637 | |
638 if (!ty) | |
639 return NULL; | |
640 | |
641 // Paste together the line from the hashed copy | |
642 r |= (kputc_('@', &ks) == EOF); | |
643 r |= (kputs(type, &ks) == EOF); | |
644 for (tag = ty->tag; tag; tag = tag->next) { | |
645 r |= (kputc_('\t', &ks) == EOF); | |
646 r |= (kputsn(tag->str, tag->len, &ks) == EOF); | |
647 } | |
648 | |
649 if (r) { | |
650 KS_FREE(&ks); | |
651 return NULL; | |
652 } | |
653 | |
654 return ks_str(&ks); | |
655 } | |
656 | |
657 | |
658 /* | |
659 * Looks for a specific key in a single sam header line. | |
660 * If prev is non-NULL it also fills this out with the previous tag, to | |
661 * permit use in key removal. *prev is set to NULL when the tag is the first | |
662 * key in the list. When a tag isn't found, prev (if non NULL) will be the last | |
663 * tag in the existing list. | |
664 * | |
665 * Returns the tag pointer on success | |
666 * NULL on failure | |
667 */ | |
668 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh, | |
669 SAM_hdr_type *type, | |
670 char *key, | |
671 SAM_hdr_tag **prev) { | |
672 SAM_hdr_tag *tag, *p = NULL; | |
673 | |
674 for (tag = type->tag; tag; p = tag, tag = tag->next) { | |
675 if (tag->str[0] == key[0] && tag->str[1] == key[1]) { | |
676 if (prev) | |
677 *prev = p; | |
678 return tag; | |
679 } | |
680 } | |
681 | |
682 if (prev) | |
683 *prev = p; | |
684 | |
685 return NULL; | |
686 } | |
687 | |
688 | |
689 /* | |
690 * Adds or updates tag key,value pairs in a header line. | |
691 * Eg for adding M5 tags to @SQ lines or updating sort order for the | |
692 * @HD line (although use the sam_hdr_sort_order() function for | |
693 * HD manipulation, which is a wrapper around this funuction). | |
694 * | |
695 * Specify multiple key,value pairs ending in NULL. | |
696 * | |
697 * Returns 0 on success | |
698 * -1 on failure | |
699 */ | |
700 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) { | |
701 va_list ap; | |
702 | |
703 va_start(ap, type); | |
704 | |
705 for (;;) { | |
706 char *k, *v; | |
707 int idx; | |
708 SAM_hdr_tag *tag, *prev; | |
709 | |
710 if (!(k = (char *)va_arg(ap, char *))) | |
711 break; | |
712 v = va_arg(ap, char *); | |
713 | |
714 tag = sam_hdr_find_key(hdr, type, k, &prev); | |
715 if (!tag) { | |
716 if (!(tag = pool_alloc(hdr->tag_pool))) | |
717 return -1; | |
718 if (prev) | |
719 prev->next = tag; | |
720 else | |
721 type->tag = tag; | |
722 | |
723 tag->next = NULL; | |
724 } | |
725 | |
726 idx = ks_len(&hdr->text); | |
727 if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0) | |
728 return -1; | |
729 tag->len = ks_len(&hdr->text) - idx; | |
730 tag->str = string_ndup(hdr->str_pool, | |
731 ks_str(&hdr->text) + idx, | |
732 tag->len); | |
733 if (!tag->str) | |
734 return -1; | |
735 } | |
736 | |
737 va_end(ap); | |
738 | |
739 return 0; | |
740 } | |
741 | |
742 #define K(a) (((a)[0]<<8)|((a)[1])) | |
743 | |
744 /* | |
745 * Reconstructs the kstring from the header hash table. | |
746 * Returns 0 on success | |
747 * -1 on failure | |
748 */ | |
749 int sam_hdr_rebuild(SAM_hdr *hdr) { | |
750 /* Order: HD then others */ | |
751 kstring_t ks = KS_INITIALIZER; | |
752 khint_t k; | |
753 | |
754 | |
755 k = kh_get(sam_hdr, hdr->h, K("HD")); | |
756 if (k != kh_end(hdr->h)) { | |
757 SAM_hdr_type *ty = kh_val(hdr->h, k); | |
758 SAM_hdr_tag *tag; | |
759 if (EOF == kputs("@HD", &ks)) | |
760 return -1; | |
761 for (tag = ty->tag; tag; tag = tag->next) { | |
762 if (EOF == kputc_('\t', &ks)) | |
763 return -1; | |
764 if (EOF == kputsn_(tag->str, tag->len, &ks)) | |
765 return -1; | |
766 } | |
767 if (EOF == kputc('\n', &ks)) | |
768 return -1; | |
769 } | |
770 | |
771 for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) { | |
772 SAM_hdr_type *t1, *t2; | |
773 | |
774 if (!kh_exist(hdr->h, k)) | |
775 continue; | |
776 | |
777 if (kh_key(hdr->h, k) == K("HD")) | |
778 continue; | |
779 | |
780 t1 = t2 = kh_val(hdr->h, k); | |
781 do { | |
782 SAM_hdr_tag *tag; | |
783 char c[2]; | |
784 | |
785 if (EOF == kputc_('@', &ks)) | |
786 return -1; | |
787 c[0] = kh_key(hdr->h, k)>>8; | |
788 c[1] = kh_key(hdr->h, k)&0xff; | |
789 if (EOF == kputsn_(c, 2, &ks)) | |
790 return -1; | |
791 for (tag = t1->tag; tag; tag=tag->next) { | |
792 if (EOF == kputc_('\t', &ks)) | |
793 return -1; | |
794 if (EOF == kputsn_(tag->str, tag->len, &ks)) | |
795 return -1; | |
796 } | |
797 if (EOF == kputc('\n', &ks)) | |
798 return -1; | |
799 t1 = t1->next; | |
800 } while (t1 != t2); | |
801 } | |
802 | |
803 if (ks_str(&hdr->text)) | |
804 KS_FREE(&hdr->text); | |
805 | |
806 hdr->text = ks; | |
807 | |
808 return 0; | |
809 } | |
810 | |
811 | |
812 /* | |
813 * Creates an empty SAM header, ready to be populated. | |
814 * | |
815 * Returns a SAM_hdr struct on success (free with sam_hdr_free()) | |
816 * NULL on failure | |
817 */ | |
818 SAM_hdr *sam_hdr_new() { | |
819 SAM_hdr *sh = calloc(1, sizeof(*sh)); | |
820 | |
821 if (!sh) | |
822 return NULL; | |
823 | |
824 sh->h = kh_init(sam_hdr); | |
825 if (!sh->h) | |
826 goto err; | |
827 | |
828 sh->ID_cnt = 1; | |
829 sh->ref_count = 1; | |
830 | |
831 sh->nref = 0; | |
832 sh->ref = NULL; | |
833 if (!(sh->ref_hash = kh_init(m_s2i))) | |
834 goto err; | |
835 | |
836 sh->nrg = 0; | |
837 sh->rg = NULL; | |
838 if (!(sh->rg_hash = kh_init(m_s2i))) | |
839 goto err; | |
840 | |
841 sh->npg = 0; | |
842 sh->pg = NULL; | |
843 sh->npg_end = sh->npg_end_alloc = 0; | |
844 sh->pg_end = NULL; | |
845 if (!(sh->pg_hash = kh_init(m_s2i))) | |
846 goto err; | |
847 | |
848 KS_INIT(&sh->text); | |
849 | |
850 if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag)))) | |
851 goto err; | |
852 | |
853 if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type)))) | |
854 goto err; | |
855 | |
856 if (!(sh->str_pool = string_pool_create(8192))) | |
857 goto err; | |
858 | |
859 return sh; | |
860 | |
861 err: | |
862 if (sh->h) | |
863 kh_destroy(sam_hdr, sh->h); | |
864 | |
865 if (sh->tag_pool) | |
866 pool_destroy(sh->tag_pool); | |
867 | |
868 if (sh->type_pool) | |
869 pool_destroy(sh->type_pool); | |
870 | |
871 if (sh->str_pool) | |
872 string_pool_destroy(sh->str_pool); | |
873 | |
874 free(sh); | |
875 | |
876 return NULL; | |
877 } | |
878 | |
879 | |
880 /* | |
881 * Tokenises a SAM header into a hash table. | |
882 * Also extracts a few bits on specific data types, such as @RG lines. | |
883 * | |
884 * Returns a SAM_hdr struct on success (free with sam_hdr_free()) | |
885 * NULL on failure | |
886 */ | |
887 SAM_hdr *sam_hdr_parse_(const char *hdr, int len) { | |
888 /* Make an empty SAM_hdr */ | |
889 SAM_hdr *sh; | |
890 | |
891 sh = sam_hdr_new(); | |
892 if (NULL == sh) return NULL; | |
893 | |
894 if (NULL == hdr) return sh; // empty header is permitted | |
895 | |
896 /* Parse the header, line by line */ | |
897 if (-1 == sam_hdr_add_lines(sh, hdr, len)) { | |
898 sam_hdr_free(sh); | |
899 return NULL; | |
900 } | |
901 | |
902 //sam_hdr_dump(sh); | |
903 //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL); | |
904 //sam_hdr_rebuild(sh); | |
905 //printf(">>%s<<", ks_str(sh->text)); | |
906 | |
907 //parse_references(sh); | |
908 //parse_read_groups(sh); | |
909 | |
910 sam_hdr_link_pg(sh); | |
911 //sam_hdr_dump(sh); | |
912 | |
913 return sh; | |
914 } | |
915 | |
916 /* | |
917 * Produces a duplicate copy of hdr and returns it. | |
918 * Returns NULL on failure | |
919 */ | |
920 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) { | |
921 if (-1 == sam_hdr_rebuild(hdr)) | |
922 return NULL; | |
923 | |
924 return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr)); | |
925 } | |
926 | |
927 /*! Increments a reference count on hdr. | |
928 * | |
929 * This permits multiple files to share the same header, all calling | |
930 * sam_hdr_free when done, without causing errors for other open files. | |
931 */ | |
932 void sam_hdr_incr_ref(SAM_hdr *hdr) { | |
933 hdr->ref_count++; | |
934 } | |
935 | |
936 /*! Increments a reference count on hdr. | |
937 * | |
938 * This permits multiple files to share the same header, all calling | |
939 * sam_hdr_free when done, without causing errors for other open files. | |
940 * | |
941 * If the reference count hits zero then the header is automatically | |
942 * freed. This makes it a synonym for sam_hdr_free(). | |
943 */ | |
944 void sam_hdr_decr_ref(SAM_hdr *hdr) { | |
945 sam_hdr_free(hdr); | |
946 } | |
947 | |
948 /*! Deallocates all storage used by a SAM_hdr struct. | |
949 * | |
950 * This also decrements the header reference count. If after decrementing | |
951 * it is still non-zero then the header is assumed to be in use by another | |
952 * caller and the free is not done. | |
953 * | |
954 * This is a synonym for sam_hdr_dec_ref(). | |
955 */ | |
956 void sam_hdr_free(SAM_hdr *hdr) { | |
957 if (!hdr) | |
958 return; | |
959 | |
960 if (--hdr->ref_count > 0) | |
961 return; | |
962 | |
963 if (ks_str(&hdr->text)) | |
964 KS_FREE(&hdr->text); | |
965 | |
966 if (hdr->h) | |
967 kh_destroy(sam_hdr, hdr->h); | |
968 | |
969 if (hdr->ref_hash) | |
970 kh_destroy(m_s2i, hdr->ref_hash); | |
971 | |
972 if (hdr->ref) { | |
973 int i; | |
974 for (i = 0; i < hdr->nref; i++) | |
975 if (hdr->ref[i].name) | |
976 free(hdr->ref[i].name); | |
977 free(hdr->ref); | |
978 } | |
979 | |
980 if (hdr->rg_hash) | |
981 kh_destroy(m_s2i, hdr->rg_hash); | |
982 | |
983 if (hdr->rg) { | |
984 int i; | |
985 for (i = 0; i < hdr->nrg; i++) | |
986 if (hdr->rg[i].name) | |
987 free(hdr->rg[i].name); | |
988 free(hdr->rg); | |
989 } | |
990 | |
991 if (hdr->pg_hash) | |
992 kh_destroy(m_s2i, hdr->pg_hash); | |
993 | |
994 if (hdr->pg) { | |
995 int i; | |
996 for (i = 0; i < hdr->npg; i++) | |
997 if (hdr->pg[i].name) | |
998 free(hdr->pg[i].name); | |
999 free(hdr->pg); | |
1000 } | |
1001 | |
1002 if (hdr->pg_end) | |
1003 free(hdr->pg_end); | |
1004 | |
1005 if (hdr->type_pool) | |
1006 pool_destroy(hdr->type_pool); | |
1007 | |
1008 if (hdr->tag_pool) | |
1009 pool_destroy(hdr->tag_pool); | |
1010 | |
1011 if (hdr->str_pool) | |
1012 string_pool_destroy(hdr->str_pool); | |
1013 | |
1014 free(hdr); | |
1015 } | |
1016 | |
1017 int sam_hdr_length(SAM_hdr *hdr) { | |
1018 return ks_len(&hdr->text); | |
1019 } | |
1020 | |
1021 char *sam_hdr_str(SAM_hdr *hdr) { | |
1022 return ks_str(&hdr->text); | |
1023 } | |
1024 | |
1025 /* | |
1026 * Looks up a reference sequence by name and returns the numerical ID. | |
1027 * Returns -1 if unknown reference. | |
1028 */ | |
1029 int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) { | |
1030 khint_t k = kh_get(m_s2i, hdr->ref_hash, ref); | |
1031 return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k); | |
1032 } | |
1033 | |
1034 /* | |
1035 * Looks up a read-group by name and returns a pointer to the start of the | |
1036 * associated tag list. | |
1037 * | |
1038 * Returns NULL on failure | |
1039 */ | |
1040 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) { | |
1041 khint_t k = kh_get(m_s2i, hdr->rg_hash, rg); | |
1042 return k == kh_end(hdr->rg_hash) | |
1043 ? NULL | |
1044 : &hdr->rg[kh_val(hdr->rg_hash, k)]; | |
1045 } | |
1046 | |
1047 | |
1048 /* | |
1049 * Fixes any PP links in @PG headers. | |
1050 * If the entries are in order then this doesn't need doing, but incase | |
1051 * our header is out of order this goes through the sh->pg[] array | |
1052 * setting the prev_id field. | |
1053 * | |
1054 * Note we can have multiple complete chains. This code should identify the | |
1055 * tails of these chains as these are the entries we have to link to in | |
1056 * subsequent PP records. | |
1057 * | |
1058 * Returns 0 on sucess | |
1059 * -1 on failure (indicating broken PG/PP records) | |
1060 */ | |
1061 int sam_hdr_link_pg(SAM_hdr *hdr) { | |
1062 int i, j, ret = 0; | |
1063 | |
1064 hdr->npg_end_alloc = hdr->npg; | |
1065 hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end)); | |
1066 if (!hdr->pg_end) | |
1067 return -1; | |
1068 | |
1069 for (i = 0; i < hdr->npg; i++) | |
1070 hdr->pg_end[i] = i; | |
1071 | |
1072 for (i = 0; i < hdr->npg; i++) { | |
1073 khint_t k; | |
1074 SAM_hdr_tag *tag; | |
1075 char tmp; | |
1076 | |
1077 for (tag = hdr->pg[i].tag; tag; tag = tag->next) { | |
1078 if (tag->str[0] == 'P' && tag->str[1] == 'P') | |
1079 break; | |
1080 } | |
1081 if (!tag) { | |
1082 /* Chain start points */ | |
1083 continue; | |
1084 } | |
1085 | |
1086 tmp = tag->str[tag->len]; tag->str[tag->len] = 0; | |
1087 k = kh_get(m_s2i, hdr->pg_hash, tag->str+3); | |
1088 tag->str[tag->len] = tmp; | |
1089 | |
1090 if (k == kh_end(hdr->pg_hash)) { | |
1091 ret = -1; | |
1092 continue; | |
1093 } | |
1094 | |
1095 hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id; | |
1096 hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1; | |
1097 } | |
1098 | |
1099 for (i = j = 0; i < hdr->npg; i++) { | |
1100 if (hdr->pg_end[i] != -1) | |
1101 hdr->pg_end[j++] = hdr->pg_end[i]; | |
1102 } | |
1103 hdr->npg_end = j; | |
1104 | |
1105 return ret; | |
1106 } | |
1107 | |
1108 /* | |
1109 * Returns a unique ID from a base name. | |
1110 * | |
1111 * The value returned is valid until the next call to | |
1112 * this function. | |
1113 */ | |
1114 const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) { | |
1115 khint_t k = kh_get(m_s2i, sh->pg_hash, name); | |
1116 if (k == kh_end(sh->pg_hash)) | |
1117 return name; | |
1118 | |
1119 do { | |
1120 sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++); | |
1121 k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf); | |
1122 } while (k == kh_end(sh->pg_hash)); | |
1123 | |
1124 return sh->ID_buf; | |
1125 } | |
1126 | |
1127 /* | |
1128 * Add an @PG line. | |
1129 * | |
1130 * If we wish complete control over this use sam_hdr_add() directly. This | |
1131 * function uses that, but attempts to do a lot of tedious house work for | |
1132 * you too. | |
1133 * | |
1134 * - It will generate a suitable ID if the supplied one clashes. | |
1135 * - It will generate multiple @PG records if we have multiple PG chains. | |
1136 * | |
1137 * Call it as per sam_hdr_add() with a series of key,value pairs ending | |
1138 * in NULL. | |
1139 * | |
1140 * Returns 0 on success | |
1141 * -1 on failure | |
1142 */ | |
1143 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) { | |
1144 va_list args; | |
1145 va_start(args, name); | |
1146 | |
1147 if (sh->npg_end) { | |
1148 /* Copy ends array to avoid us looping while modifying it */ | |
1149 int *end = malloc(sh->npg_end * sizeof(int)); | |
1150 int i, nends = sh->npg_end; | |
1151 | |
1152 if (!end) | |
1153 return -1; | |
1154 | |
1155 memcpy(end, sh->pg_end, nends * sizeof(*end)); | |
1156 | |
1157 for (i = 0; i < nends; i++) { | |
1158 if (-1 == sam_hdr_vadd(sh, "PG", args, | |
1159 "ID", sam_hdr_PG_ID(sh, name), | |
1160 "PN", name, | |
1161 "PP", sh->pg[end[i]].name, | |
1162 NULL)) { | |
1163 free(end); | |
1164 return -1; | |
1165 } | |
1166 } | |
1167 | |
1168 free(end); | |
1169 } else { | |
1170 if (-1 == sam_hdr_vadd(sh, "PG", args, | |
1171 "ID", sam_hdr_PG_ID(sh, name), | |
1172 "PN", name, | |
1173 NULL)) | |
1174 return -1; | |
1175 } | |
1176 | |
1177 //sam_hdr_dump(sh); | |
1178 | |
1179 return 0; | |
1180 } | |
1181 | |
1182 /* | |
1183 * A function to help with construction of CL tags in @PG records. | |
1184 * Takes an argc, argv pair and returns a single space-separated string. | |
1185 * This string should be deallocated by the calling function. | |
1186 * | |
1187 * Returns malloced char * on success | |
1188 * NULL on failure | |
1189 */ | |
1190 char *stringify_argv(int argc, char *argv[]) { | |
1191 char *str, *cp; | |
1192 size_t nbytes = 1; | |
1193 int i, j; | |
1194 | |
1195 /* Allocate */ | |
1196 for (i = 0; i < argc; i++) { | |
1197 nbytes += strlen(argv[i]) + 1; | |
1198 } | |
1199 if (!(str = malloc(nbytes))) | |
1200 return NULL; | |
1201 | |
1202 /* Copy */ | |
1203 cp = str; | |
1204 for (i = 0; i < argc; i++) { | |
1205 j = 0; | |
1206 while (argv[i][j]) { | |
1207 if (argv[i][j] == '\t') | |
1208 *cp++ = ' '; | |
1209 else | |
1210 *cp++ = argv[i][j]; | |
1211 j++; | |
1212 } | |
1213 *cp++ = ' '; | |
1214 } | |
1215 *cp++ = 0; | |
1216 | |
1217 return str; | |
1218 } |