Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/sam_header.h @ 0:dfa3745e5fd8
Uploaded
| author | youngkim |
|---|---|
| date | Thu, 24 Mar 2016 17:12:52 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfa3745e5fd8 |
|---|---|
| 1 /* | |
| 2 Copyright (c) 2013-2014 Genome Research Ltd. | |
| 3 Author: James Bonfield <jkb@sanger.ac.uk> | |
| 4 | |
| 5 Redistribution and use in source and binary forms, with or without | |
| 6 modification, are permitted provided that the following conditions are met: | |
| 7 | |
| 8 1. Redistributions of source code must retain the above copyright notice, | |
| 9 this list of conditions and the following disclaimer. | |
| 10 | |
| 11 2. Redistributions in binary form must reproduce the above copyright notice, | |
| 12 this list of conditions and the following disclaimer in the documentation | |
| 13 and/or other materials provided with the distribution. | |
| 14 | |
| 15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
| 16 Institute nor the names of its contributors may be used to endorse or promote | |
| 17 products derived from this software without specific prior written permission. | |
| 18 | |
| 19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
| 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| 22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
| 23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| 25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
| 26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 29 */ | |
| 30 | |
| 31 /*! \file | |
| 32 * SAM header parsing. | |
| 33 * | |
| 34 * These functions can be shared between SAM, BAM and CRAM file | |
| 35 * formats as all three internally use the same string encoding for | |
| 36 * header fields. | |
| 37 */ | |
| 38 | |
| 39 /* | |
| 40 * TODO. | |
| 41 * | |
| 42 * - Sort order (parse to struct, enum type, updating funcs) | |
| 43 * - Removal of lines. | |
| 44 * - Updating of lines | |
| 45 */ | |
| 46 | |
| 47 #ifndef _SAM_HDR_H_ | |
| 48 #define _SAM_HDR_H_ | |
| 49 | |
| 50 #ifdef __cplusplus | |
| 51 extern "C" { | |
| 52 #endif | |
| 53 | |
| 54 #ifdef HAVE_CONFIG_H | |
| 55 #include "io_lib_config.h" | |
| 56 #endif | |
| 57 | |
| 58 #include <stdarg.h> | |
| 59 | |
| 60 #include "cram/string_alloc.h" | |
| 61 #include "cram/pooled_alloc.h" | |
| 62 | |
| 63 #include "htslib/khash.h" | |
| 64 #include "htslib/kstring.h" | |
| 65 | |
| 66 // Brace initialiser for declaring an empty kstring_t. Eg kstring_t s = KS_INITIALIZER; | |
| 67 #define KS_INITIALIZER {0,0,0} | |
| 68 | |
| 69 // Resets an existing kstring_t to empty (l = 0, m = 0, s = NULL); ks must be a pointer. Eg KS_INIT(x->str); | |
| 70 #define KS_INIT(ks) ((ks)->l = 0, (ks)->m = 0, (ks)->s = NULL) | |
| 71 | |
| 72 // Frees only the character buffer ks->s (when non-NULL). The kstring_t itself is not freed and its fields are not reset. | |
| 73 #define KS_FREE(ks) do { if ((ks)->s) free((ks)->s); } while(0) | |
| 74 | |
| 75 /* | |
| 76 * Proposed new SAM header parsing | |
| 77 | |
| 78 1 @SQ ID:foo LN:100 | |
| 79 2 @SQ ID:bar LN:200 | |
| 80 3 @SQ ID:ram LN:300 UR:xyz | |
| 81 4 @RG ID:r ... | |
| 82 5 @RG ID:s ... | |
| 83 | |
| 84 Hash table for 2-char @keys without dup entries. | |
| 85 If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}. | |
| 86 | |
| 87 HASH("SQ")--\ | |
| 88 | | |
| 89 (3) <-> 1 <-> 2 <-> 3 <-> (1) | |
| 90 | |
| 91 HASH("RG")--\ | |
| 92 | | |
| 93 (5) <-> 4 <-> 5 <-> (4) | |
| 94 | |
| 95 Items stored in the hash values also form their own linked lists: | |
| 96 Ie SQ->ID(foo)->LN(100) | |
| 97 SQ->ID(bar)->LN(200) | |
| 98 SQ->ID(ram)->LN(300)->UR(xyz) | |
| 99 RG->ID(r) | |
| 100 */ | |
| 101 | |
| 102 /*! A single key:value pair on a header line | |
| 103 * | |
| 104 * These form a linked list and hold strings. The strings are | |
| 105 * allocated from a string_alloc_t pool referenced in the master | |
| 106 * SAM_hdr structure. Do not attempt to free, malloc or manipulate | |
| 107 * these strings directly. | |
| 108 */ | |
| 109 typedef struct SAM_hdr_tag_s { | |
| 110 struct SAM_hdr_tag_s *next; /*!< next key:value pair in the linked list */ | |
| 111 char *str; /*!< pool-allocated string; do not free or modify directly */ | |
| 112 int len; /*!< presumably the length of str -- TODO confirm */ | |
| 113 } SAM_hdr_tag; | |
| 114 | |
| 115 /*! The parsed version of the SAM header string. | |
| 116 * | |
| 117 * Each header type (SQ, RG, HD, etc) points to its own SAM_hdr_type | |
| 118 * struct via the main hash table h in the SAM_hdr struct. | |
| 119 * | |
| 120 * These in turn consist of circular bi-directional linked lists (ie | |
| 121 * rings) to hold the multiple instances of the same header type | |
| 122 * code. For example if we have 5 \@SQ lines the primary hash table | |
| 123 * will key on \@SQ pointing to the first SAM_hdr_type and that in turn | |
| 124 * will be part of a ring of 5 elements. | |
| 125 * | |
| 126 * For each SAM_hdr_type structure we also point to a SAM_hdr_tag | |
| 127 * structure which holds the tokenised attributes; the tab separated | |
| 128 * key:value pairs per line. | |
| 129 */ | |
| 130 typedef struct SAM_hdr_item_s { | |
| 131 struct SAM_hdr_item_s *next; // circular | |
| 132 struct SAM_hdr_item_s *prev; /* previous item in the ring */ | |
| 133 SAM_hdr_tag *tag; // first tag | |
| 134 int order; // 0 upwards | |
| 135 } SAM_hdr_type; | |
| 136 | |
| 137 /*! Parsed \@SQ lines */ | |
| 138 typedef struct { | |
| 139 char *name; /*!< reference name; presumably the SN: value -- TODO confirm */ | |
| 140 uint32_t len; /*!< presumably the LN: value -- TODO confirm */ | |
| 141 SAM_hdr_type *ty; /*!< presumably the SAM_hdr_type item for this \@SQ line -- TODO confirm */ | |
| 142 SAM_hdr_tag *tag; /*!< presumably a tag within that line -- TODO confirm which */ | |
| 143 } SAM_SQ; | |
| 144 | |
| 145 /*! Parsed \@RG lines */ | |
| 146 typedef struct { | |
| 147 char *name; /*!< read-group name; presumably the ID: value -- TODO confirm */ | |
| 148 SAM_hdr_type *ty; /*!< presumably the SAM_hdr_type item for this \@RG line -- TODO confirm */ | |
| 149 SAM_hdr_tag *tag; /*!< presumably a tag within that line -- TODO confirm which */ | |
| 150 int name_len; /*!< presumably the length of name -- TODO confirm */ | |
| 151 int id; // numerical ID | |
| 152 } SAM_RG; | |
| 153 | |
| 154 /*! Parsed \@PG lines */ | |
| 155 typedef struct { | |
| 156 char *name; /*!< program name; presumably the ID: value -- TODO confirm */ | |
| 157 SAM_hdr_type *ty; /*!< presumably the SAM_hdr_type item for this \@PG line -- TODO confirm */ | |
| 158 SAM_hdr_tag *tag; /*!< presumably a tag within that line -- TODO confirm which */ | |
| 159 int name_len; /*!< presumably the length of name -- TODO confirm */ | |
| 160 int id; // numerical ID | |
| 161 int prev_id; // -1 if none | |
| 162 } SAM_PG; | |
| 163 | |
| 164 KHASH_MAP_INIT_INT(sam_hdr, SAM_hdr_type*) /* hash type used by the h member of SAM_hdr: integer key to SAM_hdr_type pointer */ | |
| 165 KHASH_MAP_INIT_STR(m_s2i, int) /* hash type used by ref_hash, rg_hash and pg_hash: string to array index */ | |
| 166 | |
| 167 /*! Primary structure for header manipulation | |
| 168 * | |
| 169 * The initial header text is held in the text kstring_t, but is also | |
| 170 * parsed out into SQ, RG and PG arrays. These have a hash table | |
| 171 * associated with each to allow lookup by ID or SN fields instead of | |
| 172 * their numeric array indices. Additionally PG has an array to hold | |
| 173 * the linked list start points (the last in a PP chain). | |
| 174 * | |
| 175 * Use the appropriate sam_hdr_* functions to edit the header, and | |
| 176 * call sam_hdr_rebuild() any time the textual form needs to be | |
| 177 * updated again. | |
| 178 */ | |
| 179 typedef struct { | |
| 180 kstring_t text; //!< concatenated text, indexed by SAM_hdr_tag | |
| 181 khash_t(sam_hdr) *h; /*!< main hash table, keyed on header type, of SAM_hdr_type rings */ | |
| 182 string_alloc_t *str_pool; //!< Pool of SAM_hdr_tag->str strings | |
| 183 pool_alloc_t *type_pool;//!< Pool of SAM_hdr_type structs | |
| 184 pool_alloc_t *tag_pool; //!< Pool of SAM_hdr_tag structs | |
| 185 | |
| 186 // @SQ lines / references | |
| 187 int nref; //!< Number of \@SQ lines | |
| 188 SAM_SQ *ref; //!< Array of parsed \@SQ lines | |
| 189 khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to ref[] index | |
| 190 | |
| 191 // @RG lines / read-groups | |
| 192 int nrg; //!< Number of \@RG lines | |
| 193 SAM_RG *rg; //!< Array of parsed \@RG lines | |
| 194 khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index | |
| 195 | |
| 196 // @PG lines / programs | |
| 197 int npg; //!< Number of \@PG lines | |
| 198 int npg_end; //!< Number of terminating \@PG lines | |
| 199 int npg_end_alloc; //!< Size of pg_end field | |
| 200 SAM_PG *pg; //!< Array of parsed \@PG lines | |
| 201 khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index | |
| 202 int *pg_end; //!< \@PG chain termination IDs | |
| 203 | |
| 204 // @cond internal | |
| 205 char ID_buf[1024]; // temporary buffer | |
| 206 int ID_cnt; | |
| 207 int ref_count; // number of uses of this SAM_hdr | |
| 208 // @endcond | |
| 209 } SAM_hdr; | |
| 210 | |
| 211 /*! Creates an empty SAM header, ready to be populated. | |
| 212 * | |
| 213 * @return | |
| 214 * Returns a SAM_hdr struct on success (free with sam_hdr_free()) | |
| 215 * NULL on failure | |
| 216 */ | |
| 217 SAM_hdr *sam_hdr_new(void); | |
| 218 | |
| 219 /*! Tokenises a SAM header into a hash table. | |
| 220 * | |
| 221 * Also extracts a few bits on specific data types, such as @RG lines. | |
| 222 * | |
| 223 * @return | |
| 224 * Returns a SAM_hdr struct on success (free with sam_hdr_free()); | |
| 225 * NULL on failure | |
| 226 */ | |
| 227 SAM_hdr *sam_hdr_parse_(const char *hdr, int len); | |
| 228 | |
| 229 | |
| 230 /*! Produces a duplicate copy of hdr and returns it. | |
| 231 * @return | |
| 232 * Returns NULL on failure | |
| 233 */ | |
| 234 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr); | |
| 235 | |
| 236 | |
| 237 /*! Increments a reference count on hdr. | |
| 238 * | |
| 239 * This permits multiple files to share the same header, all calling | |
| 240 * sam_hdr_free when done, without causing errors for other open files. | |
| 241 */ | |
| 242 void sam_hdr_incr_ref(SAM_hdr *hdr); | |
| 243 | |
| 244 | |
| 245 /*! Decrements a reference count on hdr. | |
| 246 * | |
| 247 * This permits multiple files to share the same header, all calling | |
| 248 * sam_hdr_free when done, without causing errors for other open files. | |
| 249 * | |
| 250 * If the reference count hits zero then the header is automatically | |
| 251 * freed. This makes it a synonym for sam_hdr_free(). | |
| 252 */ | |
| 253 void sam_hdr_decr_ref(SAM_hdr *hdr); | |
| 254 | |
| 255 | |
| 256 /*! Deallocates all storage used by a SAM_hdr struct. | |
| 257 * | |
| 258 * This also decrements the header reference count. If after decrementing | |
| 259 * it is still non-zero then the header is assumed to be in use by another | |
| 260 * caller and the free is not done. | |
| 261 * | |
| 262 * This is a synonym for sam_hdr_decr_ref(). | |
| 263 */ | |
| 264 void sam_hdr_free(SAM_hdr *hdr); | |
| 265 | |
| 266 /*! Returns the current length of the SAM_hdr in text form. | |
| 267 * | |
| 268 * Call sam_hdr_rebuild() first if editing has taken place. | |
| 269 */ | |
| 270 int sam_hdr_length(SAM_hdr *hdr); | |
| 271 | |
| 272 /*! Returns the string form of the SAM_hdr. | |
| 273 * | |
| 274 * Call sam_hdr_rebuild() first if editing has taken place. | |
| 275 */ | |
| 276 char *sam_hdr_str(SAM_hdr *hdr); | |
| 277 | |
| 278 /*! Appends a formatted line to an existing SAM header. | |
| 279 * | |
| 280 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with | |
| 281 * optional new-line. If it contains more than 1 line then multiple lines | |
| 282 * will be added in order. | |
| 283 * | |
| 284 * Len is the length of the text data, or 0 if unknown (in which case | |
| 285 * it should be null terminated). | |
| 286 * | |
| 287 * @return | |
| 288 * Returns 0 on success; | |
| 289 * -1 on failure | |
| 290 */ | |
| 291 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len); | |
| 292 | |
| 293 /*! Adds a single line to a SAM header. | |
| 294 * | |
| 295 * Specify type and one or more key,value pairs, ending with the NULL key. | |
| 296 * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL). | |
| 297 * | |
| 298 * @return | |
| 299 * Returns 0 on success; | |
| 300 * -1 on failure | |
| 301 */ | |
| 302 int sam_hdr_add(SAM_hdr *sh, const char *type, ...); | |
| 303 | |
| 304 /*! Adds a single line to a SAM header. | |
| 305 * | |
| 306 * This is much like sam_hdr_add() but with the additional va_list | |
| 307 * argument. This is followed by specifying type and one or more | |
| 308 * key,value pairs, ending with the NULL key. | |
| 309 * | |
| 310 * Eg. sam_hdr_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL). | |
| 311 * | |
| 312 * The purpose of the additional va_list parameter is to permit other | |
| 313 * varargs functions to call this while including their own additional | |
| 314 * parameters; an example is in sam_hdr_add_PG(). | |
| 315 * | |
| 316 * @return | |
| 317 * Returns 0 on success; | |
| 318 * -1 on failure | |
| 319 */ | |
| 320 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...); | |
| 321 | |
| 322 /*! | |
| 323 * @return | |
| 324 * Returns the first header item matching 'type'. If ID is non-NULL it checks | |
| 325 * for the tag ID: and compares against the specified ID. | |
| 326 * | |
| 327 * Returns NULL if no type/ID is found | |
| 328 */ | |
| 329 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type, | |
| 330 char *ID_key, char *ID_value); | |
| 331 | |
| 332 /*! | |
| 333 * | |
| 334 * As per sam_hdr_find(), but returns a complete line of formatted text | |
| 335 * for a specific header type/ID combination. If ID is NULL then it returns | |
| 336 * the first line of the specified type. | |
| 337 * | |
| 338 * The returned string is malloced and should be freed by the calling | |
| 339 * function with free(). | |
| 340 * | |
| 341 * @return | |
| 342 * Returns NULL if no type/ID is found. | |
| 343 */ | |
| 344 char *sam_hdr_find_line(SAM_hdr *hdr, char *type, | |
| 345 char *ID_key, char *ID_value); | |
| 346 | |
| 347 /*! Looks for a specific key in a single sam header line. | |
| 348 * | |
| 349 * If prev is non-NULL it also fills this out with the previous tag, to | |
| 350 * permit use in key removal. *prev is set to NULL when the tag is the first | |
| 351 * key in the list. When a tag isn't found, prev (if non NULL) will be the last | |
| 352 * tag in the existing list. | |
| 353 * | |
| 354 * @return | |
| 355 * Returns the tag pointer on success; | |
| 356 * NULL on failure | |
| 357 */ | |
| 358 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh, | |
| 359 SAM_hdr_type *type, | |
| 360 char *key, | |
| 361 SAM_hdr_tag **prev); | |
| 362 | |
| 363 /*! Adds or updates tag key,value pairs in a header line. | |
| 364 * | |
| 365 * Eg for adding M5 tags to @SQ lines or updating sort order for the | |
| 366 * @HD line (although use the sam_hdr_sort_order() function for | |
| 367 * HD manipulation, which is a wrapper around this function). | |
| 368 * | |
| 369 * Specify multiple key,value pairs ending in NULL. | |
| 370 * | |
| 371 * @return | |
| 372 * Returns 0 on success; | |
| 373 * -1 on failure | |
| 374 */ | |
| 375 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...); | |
| 376 | |
| 377 /*! Reconstructs the kstring from the header hash table. | |
| 378 * @return | |
| 379 * Returns 0 on success; | |
| 380 * -1 on failure | |
| 381 */ | |
| 382 int sam_hdr_rebuild(SAM_hdr *hdr); | |
| 383 | |
| 384 /*! Looks up a reference sequence by name and returns the numerical ID. | |
| 385 * @return | |
| 386 * Returns -1 if unknown reference. | |
| 387 */ | |
| 388 int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref); | |
| 389 | |
| 390 /*! Looks up a read-group by name and returns a pointer to the start of the | |
| 391 * associated tag list. | |
| 392 * | |
| 393 * @return | |
| 394 * Returns NULL on failure | |
| 395 */ | |
| 396 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg); | |
| 397 | |
| 398 /*! Fixes any PP links in @PG headers. | |
| 399 * | |
| 400 * If the entries are in order then this doesn't need doing, but in case | |
| 401 * our header is out of order this goes through the sh->pg[] array | |
| 402 * setting the prev_id field. | |
| 403 * | |
| 404 * @return | |
| 405 * Returns 0 on success; | |
| 406 * -1 on failure (indicating broken PG/PP records) | |
| 407 */ | |
| 408 int sam_hdr_link_pg(SAM_hdr *hdr); | |
| 409 | |
| 410 | |
| 411 /*! Add an @PG line. | |
| 412 * | |
| 413 * If we wish complete control over this use sam_hdr_add() directly. This | |
| 414 * function uses that, but attempts to do a lot of tedious house work for | |
| 415 * you too. | |
| 416 * | |
| 417 * - It will generate a suitable ID if the supplied one clashes. | |
| 418 * - It will generate multiple @PG records if we have multiple PG chains. | |
| 419 * | |
| 420 * Call it as per sam_hdr_add() with a series of key,value pairs ending | |
| 421 * in NULL. | |
| 422 * | |
| 423 * @return | |
| 424 * Returns 0 on success; | |
| 425 * -1 on failure | |
| 426 */ | |
| 427 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...); | |
| 428 | |
| 429 /*! | |
| 430 * A function to help with construction of CL tags in @PG records. | |
| 431 * Takes an argc, argv pair and returns a single space-separated string. | |
| 432 * This string should be deallocated by the calling function. | |
| 433 * | |
| 434 * @return | |
| 435 * Returns malloced char * on success; | |
| 436 * NULL on failure | |
| 437 */ | |
| 438 char *stringify_argv(int argc, char *argv[]); | |
| 439 | |
| 440 #ifdef __cplusplus | |
| 441 } | |
| 442 #endif | |
| 443 | |
| 444 #endif /* _SAM_HDR_H_ */ |
