diff ezBAMQC/src/htslib/cram/sam_header.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ezBAMQC/src/htslib/cram/sam_header.c	Thu Mar 24 17:12:52 2016 -0400
@@ -0,0 +1,1218 @@
+/*
+Copyright (c) 2013 Genome Research Ltd.
+Author: James Bonfield <jkb@sanger.ac.uk>
+
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, 
+this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice, 
+this list of conditions and the following disclaimer in the documentation 
+and/or other materials provided with the distribution.
+
+   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+Institute nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "io_lib_config.h"
+#endif
+
+#include <string.h>
+#include <assert.h>
+
+#include "cram/sam_header.h"
+#include "cram/string_alloc.h"
+
+static void sam_hdr_error(char *msg, char *line, int len, int lno) {
+    int j;
+    
+    for (j = 0; j < len && line[j] != '\n'; j++)
+	;
+    fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, j, line);
+}
+
+void sam_hdr_dump(SAM_hdr *hdr) {
+    khint_t k;
+    int i;
+
+    printf("===DUMP===\n");
+    for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
+	SAM_hdr_type *t1, *t2;
+	char c[2];
+
+	if (!kh_exist(hdr->h, k))
+	    continue;
+
+	t1 = t2 = kh_val(hdr->h, k);
+	c[0] = kh_key(hdr->h, k)>>8;
+	c[1] = kh_key(hdr->h, k)&0xff;
+	printf("Type %.2s, count %d\n", c, t1->prev->order+1);
+
+	do {
+	    SAM_hdr_tag *tag;
+	    printf(">>>%d ", t1->order);
+	    for (tag = t1->tag; tag; tag=tag->next) {
+		printf("\"%.2s\":\"%.*s\"\t",
+		       tag->str, tag->len-3, tag->str+3);
+	    }
+	    putchar('\n');
+	    t1 = t1->next;
+	} while (t1 != t2);
+    }
+
+    /* Dump out PG chains */
+    printf("\n@PG chains:\n");
+    for (i = 0; i < hdr->npg_end; i++) {
+	int j;
+	printf("  %d:", i);
+	for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) {
+	    printf("%s%d(%.*s)", 
+		   j == hdr->pg_end[i] ? " " : "->",
+		   j, hdr->pg[j].name_len, hdr->pg[j].name);
+	}
+	printf("\n");
+    }
+
+    puts("===END DUMP===");
+}
+
+/* Updates the hash tables in the SAM_hdr structure.
+ *
+ * Returns 0 on success;
+ *        -1 on failure
+ */
+static int sam_hdr_update_hashes(SAM_hdr *sh,
+				 int type,
+				 SAM_hdr_type *h_type) {
+    /* Add to reference hash? */
+    if ((type>>8) == 'S' && (type&0xff) == 'Q') {
+	SAM_hdr_tag *tag;
+	int nref = sh->nref;
+
+	sh->ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
+	if (!sh->ref)
+	    return -1;
+
+	tag = h_type->tag;
+	sh->ref[nref].name = NULL;
+	sh->ref[nref].len  = 0;
+	sh->ref[nref].ty = h_type;
+	sh->ref[nref].tag  = tag;
+
+	while (tag) {
+	    if (tag->str[0] == 'S' && tag->str[1] == 'N') {
+		if (!(sh->ref[nref].name = malloc(tag->len)))
+		    return -1;
+		strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
+		sh->ref[nref].name[tag->len-3] = 0;
+	    } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
+		sh->ref[nref].len = atoi(tag->str+3);
+	    }
+	    tag = tag->next;
+	}
+
+	if (sh->ref[nref].name) {
+	    khint_t k;
+	    int r;
+	    k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r);
+	    if (-1 == r) return -1;
+	    kh_val(sh->ref_hash, k) = nref;
+	}
+
+	sh->nref++;
+    }
+
+    /* Add to read-group hash? */
+    if ((type>>8) == 'R' && (type&0xff) == 'G') {
+	SAM_hdr_tag *tag;
+	int nrg = sh->nrg;
+
+	sh->rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
+	if (!sh->rg)
+	    return -1;
+
+	tag = h_type->tag;
+	sh->rg[nrg].name = NULL;
+	sh->rg[nrg].name_len = 0;
+	sh->rg[nrg].ty   = h_type;
+	sh->rg[nrg].tag  = tag;
+	sh->rg[nrg].id   = nrg;
+
+	while (tag) {
+	    if (tag->str[0] == 'I' && tag->str[1] == 'D') {
+		if (!(sh->rg[nrg].name = malloc(tag->len)))
+		    return -1;
+		strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
+		sh->rg[nrg].name[tag->len-3] = 0;
+		sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
+	    }
+	    tag = tag->next;
+	}
+
+	if (sh->rg[nrg].name) {
+	    khint_t k;
+	    int r;
+	    k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r);
+	    if (-1 == r) return -1;
+	    kh_val(sh->rg_hash, k) = nrg;
+	}
+
+	sh->nrg++;
+    }
+
+    /* Add to program hash? */
+    if ((type>>8) == 'P' && (type&0xff) == 'G') {
+	SAM_hdr_tag *tag;
+	int npg = sh->npg;
+
+	sh->pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
+	if (!sh->pg)
+	    return -1;
+
+	tag = h_type->tag;
+	sh->pg[npg].name = NULL;
+	sh->pg[npg].name_len = 0;
+	sh->pg[npg].ty  = h_type;
+	sh->pg[npg].tag  = tag;
+	sh->pg[npg].id   = npg;
+	sh->pg[npg].prev_id = -1;
+
+	while (tag) {
+	    if (tag->str[0] == 'I' && tag->str[1] == 'D') {
+		if (!(sh->pg[npg].name = malloc(tag->len)))
+		    return -1;
+		strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
+		sh->pg[npg].name[tag->len-3] = 0;
+		sh->pg[npg].name_len = strlen(sh->pg[npg].name);
+	    } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
+		// Resolve later if needed
+		khint_t k;
+		char tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
+		k = kh_get(m_s2i, sh->pg_hash, tag->str+3);
+		tag->str[tag->len] = tmp;
+
+		if (k != kh_end(sh->pg_hash)) {
+		    int p_id = kh_val(sh->pg_hash, k);
+		    sh->pg[npg].prev_id = sh->pg[p_id].id;
+
+		    /* Unmark previous entry as a PG termination */
+		    if (sh->npg_end > 0 &&
+			sh->pg_end[sh->npg_end-1] == p_id) {
+			sh->npg_end--;
+		    } else {
+			int i;
+			for (i = 0; i < sh->npg_end; i++) {
+			    if (sh->pg_end[i] == p_id) {
+				memmove(&sh->pg_end[i], &sh->pg_end[i+1],
+					(sh->npg_end-i-1)*sizeof(*sh->pg_end));
+				sh->npg_end--;
+			    }
+			}
+		    }
+		} else {
+		    sh->pg[npg].prev_id = -1;
+		}
+	    }
+	    tag = tag->next;
+	}
+
+	if (sh->pg[npg].name) {
+	    khint_t k;
+	    int r;
+	    k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r);
+	    if (-1 == r) return -1;
+	    kh_val(sh->pg_hash, k) = npg;
+	}
+
+	/* Add to npg_end[] array. Remove later if we find a PP line */
+	if (sh->npg_end >= sh->npg_end_alloc) {
+	    sh->npg_end_alloc = sh->npg_end_alloc
+		? sh->npg_end_alloc*2
+		: 4;
+	    sh->pg_end = realloc(sh->pg_end,
+				 sh->npg_end_alloc * sizeof(int));
+	    if (!sh->pg_end)
+		return -1;
+	}
+	sh->pg_end[sh->npg_end++] = npg;
+
+	sh->npg++;
+    }
+
+    return 0;
+}
+
+/*
+ * Appends a formatted line to an existing SAM header.
+ * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
+ * optional new-line. If it contains more than 1 line then multiple lines
+ * will be added in order.
+ *
+ * Len is the length of the text data, or 0 if unknown (in which case
+ * it should be null terminated).
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
+    int i, lno = 1, text_offset;
+    char *hdr;
+
+    if (!len)
+	len = strlen(lines);
+
+    text_offset = ks_len(&sh->text);
+    if (EOF == kputsn(lines, len, &sh->text))
+	return -1;
+    hdr = ks_str(&sh->text) + text_offset;
+
+    for (i = 0; i < len; i++) {
+	khint32_t type;
+	khint_t k;
+
+	int l_start = i, new;
+	SAM_hdr_type *h_type;
+	SAM_hdr_tag *h_tag, *last;
+
+	if (hdr[i] != '@') {
+	    int j;
+	    for (j = i; j < len && hdr[j] != '\n'; j++)
+		;
+	    sam_hdr_error("Header line does not start with '@'",
+			  &hdr[l_start], len - l_start, lno);
+	    return -1;
+	}
+
+	type = (hdr[i+1]<<8) | hdr[i+2];
+	if (hdr[i+1] < 'A' || hdr[i+1] > 'z' ||
+	    hdr[i+2] < 'A' || hdr[i+2] > 'z') {
+	    sam_hdr_error("Header line does not have a two character key",
+			  &hdr[l_start], len - l_start, lno);
+	    return -1;
+	}
+
+	i += 3;
+	if (hdr[i] == '\n')
+	    continue;
+
+	// Add the header line type
+	if (!(h_type = pool_alloc(sh->type_pool)))
+	    return -1;
+	if (-1 == (k = kh_put(sam_hdr, sh->h, type, &new)))
+	    return -1;
+
+	// Form the ring, either with self or other lines of this type
+	if (!new) {
+	    SAM_hdr_type *t = kh_val(sh->h, k), *p;
+	    p = t->prev;
+	    
+	    assert(p->next = t);
+	    p->next = h_type;
+	    h_type->prev = p;
+
+	    t->prev = h_type;
+	    h_type->next = t;
+	    h_type->order = p->order+1;
+	} else {
+	    kh_val(sh->h, k) = h_type;
+	    h_type->prev = h_type->next = h_type;
+	    h_type->order = 0;
+	}
+
+	// Parse the tags on this line
+	last = NULL;
+	if ((type>>8) == 'C' && (type&0xff) == 'O') {
+	    int j;
+	    if (hdr[i] != '\t') {
+		sam_hdr_error("Missing tab",
+			      &hdr[l_start], len - l_start, lno);
+		return -1;
+	    }
+
+	    for (j = ++i; j < len && hdr[j] != '\n'; j++)
+		;
+
+	    if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
+		return -1;
+	    h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
+	    h_tag->len = j-i;
+	    h_tag->next = NULL;
+	    if (!h_tag->str)
+		return -1;
+
+	    i = j;
+
+	} else {
+	    do {
+		int j;
+		if (hdr[i] != '\t') {
+		    sam_hdr_error("Missing tab",
+				  &hdr[l_start], len - l_start, lno);
+		    return -1;
+		}
+
+		for (j = ++i; j < len && hdr[j] != '\n' && hdr[j] != '\t'; j++)
+		    ;
+	    
+		if (!(h_tag = pool_alloc(sh->tag_pool)))
+		    return -1;
+		h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
+		h_tag->len = j-i;
+		h_tag->next = NULL;
+		if (!h_tag->str)
+		    return -1;
+
+		if (h_tag->len < 3 || h_tag->str[2] != ':') {
+		    sam_hdr_error("Malformed key:value pair",
+				  &hdr[l_start], len - l_start, lno);
+		    return -1;
+		}
+	    
+		if (last)
+		    last->next = h_tag;
+		else
+		    h_type->tag = h_tag;
+
+		last = h_tag;
+		i = j;
+	    } while (i < len && hdr[i] != '\n');
+	}
+
+	/* Update RG/SQ hashes */
+	if (-1 == sam_hdr_update_hashes(sh, type, h_type))
+	    return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Adds a single line to a SAM header.
+ * Specify type and one or more key,value pairs, ending with the NULL key.
+ * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
+ *
+ * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
+ *        -1 on failure
+ */
+int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
+    va_list args;
+    va_start(args, type);
+    return sam_hdr_vadd(sh, type, args, NULL);
+}
+
+int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
+    va_list args;
+    SAM_hdr_type *h_type;
+    SAM_hdr_tag *h_tag, *last;
+    int new;
+    khint32_t type_i = (type[0]<<8) | type[1], k;
+
+#if defined(HAVE_VA_COPY)
+    va_list ap_local;
+#endif
+
+    if (EOF == kputc_('@', &sh->text))
+	return -1;
+    if (EOF == kputsn(type, 2, &sh->text))
+	return -1;
+
+    if (!(h_type = pool_alloc(sh->type_pool)))
+	return -1;
+    if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new)))
+	return -1;
+    kh_val(sh->h, k) = h_type;
+
+    // Form the ring, either with self or other lines of this type
+    if (!new) {
+	SAM_hdr_type *t = kh_val(sh->h, k), *p;
+	p = t->prev;
+	    
+	assert(p->next = t);
+	p->next = h_type;
+	h_type->prev = p;
+
+	t->prev = h_type;
+	h_type->next = t;
+	h_type->order = p->order + 1;
+    } else {
+	h_type->prev = h_type->next = h_type;
+	h_type->order = 0;
+    }
+
+    last = NULL;
+
+    // Any ... varargs
+    va_start(args, ap);
+    for (;;) {
+	char *k, *v;
+	int idx;
+	
+	if (!(k = (char *)va_arg(args, char *)))
+	    break;
+	v = va_arg(args, char *);
+
+	if (EOF == kputc_('\t', &sh->text))
+	    return -1;
+
+	if (!(h_tag = pool_alloc(sh->tag_pool)))
+	    return -1;
+	idx = ks_len(&sh->text);
+	
+	if (EOF == kputs(k, &sh->text))
+	    return -1;
+	if (EOF == kputc_(':', &sh->text))
+	    return -1;
+	if (EOF == kputs(v, &sh->text))
+	    return -1;
+
+	h_tag->len = ks_len(&sh->text) - idx;
+	h_tag->str = string_ndup(sh->str_pool,
+				 ks_str(&sh->text) + idx,
+				 h_tag->len);
+	h_tag->next = NULL;
+	if (!h_tag->str)
+	    return -1;
+
+	if (last)
+	    last->next = h_tag;
+	else
+	    h_type->tag = h_tag;
+	
+	last = h_tag;
+    }
+    va_end(args);
+
+#if defined(HAVE_VA_COPY)
+    va_copy(ap_local, ap);
+#   define ap ap_local
+#endif
+
+    // Plus the specified va_list params
+    for (;;) {
+	char *k, *v;
+	int idx;
+	
+	if (!(k = (char *)va_arg(ap, char *)))
+	    break;
+	v = va_arg(ap, char *);
+
+	if (EOF == kputc_('\t', &sh->text))
+	    return -1;
+
+	if (!(h_tag = pool_alloc(sh->tag_pool)))
+	    return -1;
+	idx = ks_len(&sh->text);
+	
+	if (EOF == kputs(k, &sh->text))
+	    return -1;
+	if (EOF == kputc_(':', &sh->text))
+	    return -1;
+	if (EOF == kputs(v, &sh->text))
+	    return -1;
+
+	h_tag->len = ks_len(&sh->text) - idx;
+	h_tag->str = string_ndup(sh->str_pool,
+				 ks_str(&sh->text) + idx,
+				 h_tag->len);
+	h_tag->next = NULL;
+	if (!h_tag->str)
+	    return -1;
+
+	if (last)
+	    last->next = h_tag;
+	else
+	    h_type->tag = h_tag;
+	
+	last = h_tag;
+    }
+    va_end(ap);
+
+    if (EOF == kputc('\n', &sh->text))
+	return -1;
+
+    int itype = (type[0]<<8) | type[1];
+    if (-1 == sam_hdr_update_hashes(sh, itype, h_type))
+	return -1;
+
+    return h_type->order;
+}
+
+/*
+ * Returns the first header item matching 'type'. If ID is non-NULL it checks
+ * for the tag ID: and compares against the specified ID.
+ *
+ * Returns NULL if no type/ID is found
+ */
+SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
+			   char *ID_key, char *ID_value) {
+    SAM_hdr_type *t1, *t2;
+    int itype = (type[0]<<8)|(type[1]);
+    khint_t k;
+
+    /* Special case for types we have prebuilt hashes on */
+    if (ID_key) {
+	if (type[0]   == 'S' && type[1]   == 'Q' &&
+	    ID_key[0] == 'S' && ID_key[1] == 'N') {
+	    k = kh_get(m_s2i, hdr->ref_hash, ID_value);
+	    return k != kh_end(hdr->ref_hash)
+		? hdr->ref[kh_val(hdr->ref_hash, k)].ty
+		: NULL;
+	}
+
+	if (type[0]   == 'R' && type[1]   == 'G' &&
+	    ID_key[0] == 'I' && ID_key[1] == 'D') {
+	    k = kh_get(m_s2i, hdr->rg_hash, ID_value);
+	    return k != kh_end(hdr->rg_hash)
+		? hdr->rg[kh_val(hdr->rg_hash, k)].ty
+		: NULL;
+	}
+
+	if (type[0]   == 'P' && type[1]   == 'G' &&
+	    ID_key[0] == 'I' && ID_key[1] == 'D') {
+	    k = kh_get(m_s2i, hdr->pg_hash, ID_value);
+	    return k != kh_end(hdr->pg_hash)
+		? hdr->pg[kh_val(hdr->pg_hash, k)].ty
+		: NULL;
+	}
+    }
+
+    k = kh_get(sam_hdr, hdr->h, itype);
+    if (k == kh_end(hdr->h))
+	return NULL;
+    
+    if (!ID_key)
+	return kh_val(hdr->h, k);
+
+    t1 = t2 = kh_val(hdr->h, k);
+    do {
+	SAM_hdr_tag *tag;
+	for (tag = t1->tag; tag; tag = tag->next) {
+	    if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
+		char *cp1 = tag->str+3;
+		char *cp2 = ID_value;
+		while (*cp1 && *cp1 == *cp2)
+		    cp1++, cp2++;
+		if (*cp2 || *cp1)
+		    continue;
+		return t1;
+	    }
+	}
+	t1 = t1->next;
+    } while (t1 != t2);
+
+    return NULL;
+}
+
+/*
+ * As per SAM_hdr_type, but returns a complete line of formatted text
+ * for a specific head type/ID combination. If ID is NULL then it returns
+ * the first line of the specified type.
+ *
+ * The returned string is malloced and should be freed by the calling
+ * function with free().
+ *
+ * Returns NULL if no type/ID is found.
+ */
+char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
+			char *ID_key, char *ID_value) {
+    SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
+    kstring_t ks = KS_INITIALIZER;
+    SAM_hdr_tag *tag;
+    int r = 0;
+
+    if (!ty)
+	return NULL;
+
+    // Paste together the line from the hashed copy
+    r |= (kputc_('@', &ks) == EOF);
+    r |= (kputs(type, &ks) == EOF);
+    for (tag = ty->tag; tag; tag = tag->next) {
+	r |= (kputc_('\t', &ks) == EOF);
+	r |= (kputsn(tag->str, tag->len, &ks) == EOF);
+    }
+
+    if (r) {
+	KS_FREE(&ks);
+	return NULL;
+    }
+
+    return ks_str(&ks);
+}
+
+
+/*
+ * Looks for a specific key in a single sam header line.
+ * If prev is non-NULL it also fills this out with the previous tag, to
+ * permit use in key removal. *prev is set to NULL when the tag is the first
+ * key in the list. When a tag isn't found, prev (if non NULL) will be the last
+ * tag in the existing list.
+ *
+ * Returns the tag pointer on success
+ *         NULL on failure
+ */
+SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
+			      SAM_hdr_type *type,
+			      char *key,
+			      SAM_hdr_tag **prev) {
+    SAM_hdr_tag *tag, *p = NULL;
+
+    for (tag = type->tag; tag; p = tag, tag = tag->next) {
+	if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
+	    if (prev)
+		*prev = p;
+	    return tag;
+	}
+    }
+
+    if (prev)
+	*prev = p;
+
+    return NULL;
+}
+
+
+/*
+ * Adds or updates tag key,value pairs in a header line.
+ * Eg for adding M5 tags to @SQ lines or updating sort order for the
+ * @HD line (although use the sam_hdr_sort_order() function for
+ * HD manipulation, which is a wrapper around this funuction).
+ *
+ * Specify multiple key,value pairs ending in NULL.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
+    va_list ap;
+
+    va_start(ap, type);
+    
+    for (;;) {
+	char *k, *v;
+	int idx;
+	SAM_hdr_tag *tag, *prev;
+
+	if (!(k = (char *)va_arg(ap, char *)))
+	    break;
+	v = va_arg(ap, char *);
+
+	tag = sam_hdr_find_key(hdr, type, k, &prev);
+	if (!tag) {
+	    if (!(tag = pool_alloc(hdr->tag_pool)))
+		return -1;
+	    if (prev)
+		prev->next = tag;
+	    else
+		type->tag = tag;
+
+	    tag->next = NULL;
+	}
+
+	idx = ks_len(&hdr->text);
+	if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0)
+	    return -1;
+	tag->len = ks_len(&hdr->text) - idx;
+	tag->str = string_ndup(hdr->str_pool,
+			       ks_str(&hdr->text) + idx,
+			       tag->len);
+	if (!tag->str)
+	    return -1;
+    }
+
+    va_end(ap);
+
+    return 0;
+}
+
+#define K(a) (((a)[0]<<8)|((a)[1]))
+
+/*
+ * Reconstructs the kstring from the header hash table.
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_rebuild(SAM_hdr *hdr) {
+    /* Order: HD then others */
+    kstring_t ks = KS_INITIALIZER;
+    khint_t k;
+
+
+    k = kh_get(sam_hdr, hdr->h, K("HD"));
+    if (k != kh_end(hdr->h)) {
+	SAM_hdr_type *ty = kh_val(hdr->h, k);
+	SAM_hdr_tag *tag;
+	if (EOF == kputs("@HD", &ks))
+	    return -1;
+	for (tag = ty->tag; tag; tag = tag->next) {
+	    if (EOF == kputc_('\t', &ks))
+		return -1;
+	    if (EOF == kputsn_(tag->str, tag->len, &ks))
+		return -1;
+	}
+	if (EOF == kputc('\n', &ks))
+	    return -1;
+    }
+
+    for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
+	SAM_hdr_type *t1, *t2;
+
+	if (!kh_exist(hdr->h, k))
+	    continue;
+
+	if (kh_key(hdr->h, k) == K("HD"))
+	    continue;
+
+	t1 = t2 = kh_val(hdr->h, k);
+	do {
+	    SAM_hdr_tag *tag;
+	    char c[2];
+
+	    if (EOF == kputc_('@', &ks))
+		return -1;
+	    c[0] = kh_key(hdr->h, k)>>8;
+	    c[1] = kh_key(hdr->h, k)&0xff;
+	    if (EOF == kputsn_(c, 2, &ks))
+		return -1;
+	    for (tag = t1->tag; tag; tag=tag->next) {
+		if (EOF == kputc_('\t', &ks))
+		    return -1;
+		if (EOF == kputsn_(tag->str, tag->len, &ks))
+		    return -1;
+	    }
+	    if (EOF == kputc('\n', &ks))
+		return -1;
+	    t1 = t1->next;
+	} while (t1 != t2);
+    }
+
+    if (ks_str(&hdr->text))
+	KS_FREE(&hdr->text);
+
+    hdr->text = ks;
+
+    return 0;
+}
+
+
+/*
+ * Creates an empty SAM header, ready to be populated.
+ * 
+ * Returns a SAM_hdr struct on success (free with sam_hdr_free())
+ *         NULL on failure
+ */
+SAM_hdr *sam_hdr_new() {
+    SAM_hdr *sh = calloc(1, sizeof(*sh));
+
+    if (!sh)
+	return NULL;
+    
+    sh->h = kh_init(sam_hdr);
+    if (!sh->h)
+	goto err;
+
+    sh->ID_cnt = 1;
+    sh->ref_count = 1;
+
+    sh->nref = 0;
+    sh->ref  = NULL;
+    if (!(sh->ref_hash = kh_init(m_s2i)))
+	goto err;
+
+    sh->nrg = 0;
+    sh->rg  = NULL;
+    if (!(sh->rg_hash = kh_init(m_s2i)))
+	goto err;
+
+    sh->npg = 0;
+    sh->pg  = NULL;
+    sh->npg_end = sh->npg_end_alloc = 0;
+    sh->pg_end = NULL;
+    if (!(sh->pg_hash = kh_init(m_s2i)))
+	goto err;
+
+    KS_INIT(&sh->text);
+
+    if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
+	goto err;
+
+    if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
+	goto err;
+
+    if (!(sh->str_pool = string_pool_create(8192)))
+	goto err;
+
+    return sh;
+
+ err:
+    if (sh->h)
+	kh_destroy(sam_hdr, sh->h);
+
+    if (sh->tag_pool)
+	pool_destroy(sh->tag_pool);
+
+    if (sh->type_pool)
+	pool_destroy(sh->type_pool);
+
+    if (sh->str_pool)
+	string_pool_destroy(sh->str_pool);
+
+    free(sh);
+
+    return NULL;
+}
+
+
+/*
+ * Tokenises a SAM header into a hash table.
+ * Also extracts a few bits on specific data types, such as @RG lines.
+ *
+ * Returns a SAM_hdr struct on success (free with sam_hdr_free())
+ *         NULL on failure
+ */
+SAM_hdr *sam_hdr_parse_(const char *hdr, int len) {
+    /* Make an empty SAM_hdr */
+    SAM_hdr *sh;
+    
+    sh = sam_hdr_new();
+    if (NULL == sh) return NULL;
+
+    if (NULL == hdr) return sh; // empty header is permitted
+
+    /* Parse the header, line by line */
+    if (-1 == sam_hdr_add_lines(sh, hdr, len)) {
+	sam_hdr_free(sh);
+	return NULL;
+    }
+
+    //sam_hdr_dump(sh);
+    //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL);
+    //sam_hdr_rebuild(sh);
+    //printf(">>%s<<", ks_str(sh->text));
+
+    //parse_references(sh);
+    //parse_read_groups(sh);
+
+    sam_hdr_link_pg(sh);
+    //sam_hdr_dump(sh);
+
+    return sh;
+}
+
+/*
+ * Produces a duplicate copy of hdr and returns it.
+ * Returns NULL on failure
+ */
+SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
+    if (-1 == sam_hdr_rebuild(hdr))
+	return NULL;
+
+    return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr));
+}
+
+/*! Increments a reference count on hdr.
+ *
+ * This permits multiple files to share the same header, all calling
+ * sam_hdr_free when done, without causing errors for other open  files.
+ */
+void sam_hdr_incr_ref(SAM_hdr *hdr) {
+    hdr->ref_count++;
+}
+
+/*! Increments a reference count on hdr.
+ *
+ * This permits multiple files to share the same header, all calling
+ * sam_hdr_free when done, without causing errors for other open  files.
+ *
+ * If the reference count hits zero then the header is automatically
+ * freed. This makes it a synonym for sam_hdr_free().
+ */
+void sam_hdr_decr_ref(SAM_hdr *hdr) {
+    sam_hdr_free(hdr);
+}
+
+/*! Deallocates all storage used by a SAM_hdr struct.
+ *
+ * This also decrements the header reference count. If after decrementing 
+ * it is still non-zero then the header is assumed to be in use by another
+ * caller and the free is not done.
+ *
+ * This is a synonym for sam_hdr_dec_ref().
+ */
+void sam_hdr_free(SAM_hdr *hdr) {
+    if (!hdr)
+	return;
+
+    if (--hdr->ref_count > 0)
+	return;
+
+    if (ks_str(&hdr->text))
+	KS_FREE(&hdr->text);
+
+    if (hdr->h)
+	kh_destroy(sam_hdr, hdr->h);
+
+    if (hdr->ref_hash)
+	kh_destroy(m_s2i, hdr->ref_hash);
+
+    if (hdr->ref) {
+	int i;
+	for (i = 0; i < hdr->nref; i++)
+	    if (hdr->ref[i].name)
+		free(hdr->ref[i].name);
+	free(hdr->ref);
+    }
+
+    if (hdr->rg_hash)
+	kh_destroy(m_s2i, hdr->rg_hash);
+
+    if (hdr->rg) {
+	int i;
+	for (i = 0; i < hdr->nrg; i++)
+	    if (hdr->rg[i].name)
+		free(hdr->rg[i].name);
+	free(hdr->rg);
+    }
+
+    if (hdr->pg_hash)
+	kh_destroy(m_s2i, hdr->pg_hash);
+
+    if (hdr->pg) {
+	int i;
+	for (i = 0; i < hdr->npg; i++)
+	    if (hdr->pg[i].name)
+		free(hdr->pg[i].name);
+	free(hdr->pg);
+    }
+
+    if (hdr->pg_end)
+	free(hdr->pg_end);
+
+    if (hdr->type_pool)
+	pool_destroy(hdr->type_pool);
+
+    if (hdr->tag_pool)
+	pool_destroy(hdr->tag_pool);
+
+    if (hdr->str_pool)
+	string_pool_destroy(hdr->str_pool);
+
+    free(hdr);
+}
+
+int sam_hdr_length(SAM_hdr *hdr) {
+    return ks_len(&hdr->text);
+}
+
+char *sam_hdr_str(SAM_hdr *hdr) {
+    return ks_str(&hdr->text);
+}
+
+/*
+ * Looks up a reference sequence by name and returns the numerical ID.
+ * Returns -1 if unknown reference.
+ */
+int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) {
+    khint_t k = kh_get(m_s2i, hdr->ref_hash, ref);
+    return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k);
+}
+
+/*
+ * Looks up a read-group by name and returns a pointer to the start of the
+ * associated tag list.
+ *
+ * Returns NULL on failure
+ */
+SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) {
+    khint_t k = kh_get(m_s2i, hdr->rg_hash, rg);
+    return k == kh_end(hdr->rg_hash)
+	? NULL
+	: &hdr->rg[kh_val(hdr->rg_hash, k)];
+}
+
+
+/*
+ * Fixes any PP links in @PG headers.
+ * If the entries are in order then this doesn't need doing, but incase
+ * our header is out of order this goes through the sh->pg[] array
+ * setting the prev_id field.
+ *
+ * Note we can have multiple complete chains. This code should identify the
+ * tails of these chains as these are the entries we have to link to in
+ * subsequent PP records.
+ *
+ * Returns 0 on sucess
+ *        -1 on failure (indicating broken PG/PP records)
+ */
+int sam_hdr_link_pg(SAM_hdr *hdr) {
+    int i, j, ret = 0;
+
+    hdr->npg_end_alloc = hdr->npg;
+    hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end));
+    if (!hdr->pg_end)
+	return -1;
+
+    for (i = 0; i < hdr->npg; i++)
+	hdr->pg_end[i] = i;
+
+    for (i = 0; i < hdr->npg; i++) {
+	khint_t k;
+	SAM_hdr_tag *tag;
+	char tmp;
+
+	for (tag = hdr->pg[i].tag; tag; tag = tag->next) {
+	    if (tag->str[0] == 'P' && tag->str[1] == 'P')
+		break;
+	}
+	if (!tag) {
+	    /* Chain start points */
+	    continue;
+	}
+
+	tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
+	k = kh_get(m_s2i, hdr->pg_hash, tag->str+3);
+	tag->str[tag->len] = tmp;
+
+	if (k == kh_end(hdr->pg_hash)) {
+	    ret = -1;
+	    continue;
+	}
+
+	hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id;
+	hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1;
+    }
+
+    for (i = j = 0; i < hdr->npg; i++) {
+	if (hdr->pg_end[i] != -1)
+	    hdr->pg_end[j++] = hdr->pg_end[i];
+    }
+    hdr->npg_end = j;
+
+    return ret;
+}
+
+/*
+ * Returns a unique ID from a base name.
+ *
+ * The value returned is valid until the next call to
+ * this function.
+ */
+const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) {
+    khint_t k = kh_get(m_s2i, sh->pg_hash, name);
+    if (k == kh_end(sh->pg_hash))
+	return name;
+
+    do {
+	sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++);
+	k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf);
+    } while (k == kh_end(sh->pg_hash));
+
+    return sh->ID_buf;
+}
+
+/*
+ * Add an @PG line.
+ *
+ * If we wish complete control over this use sam_hdr_add() directly. This
+ * function uses that, but attempts to do a lot of tedious house work for
+ * you too.
+ *
+ * - It will generate a suitable ID if the supplied one clashes.
+ * - It will generate multiple @PG records if we have multiple PG chains.
+ *
+ * Call it as per sam_hdr_add() with a series of key,value pairs ending
+ * in NULL.
+ *
+ * Returns 0 on success
+ *        -1 on failure
+ */
+int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) {
+    va_list args;
+    va_start(args, name);
+
+    if (sh->npg_end) {
+	/* Copy ends array to avoid us looping while modifying it */
+	int *end = malloc(sh->npg_end * sizeof(int));
+	int i, nends = sh->npg_end;
+
+	if (!end)
+	    return -1;
+
+	memcpy(end, sh->pg_end, nends * sizeof(*end));
+
+	for (i = 0; i < nends; i++) {
+	    if (-1 == sam_hdr_vadd(sh, "PG", args,
+				   "ID", sam_hdr_PG_ID(sh, name),
+				   "PN", name,
+				   "PP", sh->pg[end[i]].name,
+				   NULL)) {
+		free(end);
+		return  -1;
+	    }
+	}
+
+	free(end);
+    } else {
+	if (-1 == sam_hdr_vadd(sh, "PG", args,
+			       "ID", sam_hdr_PG_ID(sh, name),
+			       "PN", name,
+			       NULL))
+	    return -1;
+    }
+
+    //sam_hdr_dump(sh);
+
+    return 0;
+}
+
+/*
+ * A function to help with construction of CL tags in @PG records.
+ * Takes an argc, argv pair and returns a single space-separated string.
+ * This string should be deallocated by the calling function.
+ * 
+ * Returns malloced char * on success
+ *         NULL on failure
+ */
+char *stringify_argv(int argc, char *argv[]) {
+    char *str, *cp;
+    size_t nbytes = 1;
+    int i, j;
+
+    /* Allocate */
+    for (i = 0; i < argc; i++) {
+	nbytes += strlen(argv[i]) + 1;
+    }
+    if (!(str = malloc(nbytes)))
+	return NULL;
+
+    /* Copy */
+    cp = str;
+    for (i = 0; i < argc; i++) {
+	j = 0;
+	while (argv[i][j]) {
+	    if (argv[i][j] == '\t')
+		*cp++ = ' ';
+	    else
+		*cp++ = argv[i][j];
+	    j++;
+	}
+	*cp++ = ' ';
+    }
+    *cp++ = 0;
+
+    return str;
+}