comparison ezBAMQC/src/htslib/tabix.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /* tabix.c -- Generic indexer for TAB-delimited genome position files.
2
3 Copyright (C) 2009-2011 Broad Institute.
4 Copyright (C) 2010-2012, 2014 Genome Research Ltd.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <unistd.h>
29 #include <string.h>
30 #include <getopt.h>
31 #include <sys/types.h>
32 #include <sys/stat.h>
33 #include <errno.h>
34 #include "htslib/tbx.h"
35 #include "htslib/sam.h"
36 #include "htslib/vcf.h"
37 #include "htslib/kseq.h"
38 #include "htslib/bgzf.h"
39 #include "htslib/hts.h"
40 #include "htslib/regidx.h"
41
42 typedef struct
43 {
44 char *regions_fname, *targets_fname;
45 int print_header, header_only;
46 }
47 args_t;
48
49 static void error(const char *format, ...)
50 {
51 va_list ap;
52 va_start(ap, format);
53 vfprintf(stderr, format, ap);
54 va_end(ap);
55 exit(EXIT_FAILURE);
56 }
57
58 #define IS_GFF (1<<0)
59 #define IS_BED (1<<1)
60 #define IS_SAM (1<<2)
61 #define IS_VCF (1<<3)
62 #define IS_BCF (1<<4)
63 #define IS_BAM (1<<5)
64 #define IS_CRAM (1<<6)
65 #define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF)
66
67 int file_type(const char *fname)
68 {
69 int l = strlen(fname);
70 int strcasecmp(const char *s1, const char *s2);
71 if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF;
72 else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED;
73 else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM;
74 else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF;
75 else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF;
76 else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM;
77 else if (l>=4 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM;
78
79 htsFile *fp = hts_open(fname,"r");
80 enum htsExactFormat format = fp->format.format;
81 hts_close(fp);
82 if ( format == bcf ) return IS_BCF;
83 if ( format == bam ) return IS_BAM;
84 if ( format == cram ) return IS_CRAM;
85 if ( format == vcf ) return IS_VCF;
86
87 return 0;
88 }
89
90 static char **parse_regions(char *regions_fname, char **argv, int argc, int *nregs)
91 {
92 kstring_t str = {0,0,0};
93 int iseq = 0, ireg = 0;
94 char **regs = NULL;
95 *nregs = argc;
96
97 if ( regions_fname )
98 {
99 // improve me: this is a too heavy machinery for parsing regions...
100
101 regidx_t *idx = regidx_init(regions_fname, NULL, NULL, 0, NULL);
102 if ( !idx ) error("Could not read %s\n", regions_fname);
103
104 (*nregs) += regidx_nregs(idx);
105 regs = (char**) malloc(sizeof(char*)*(*nregs));
106
107 int nseq;
108 char **seqs = regidx_seq_names(idx, &nseq);
109 for (iseq=0; iseq<nseq; iseq++)
110 {
111 regitr_t itr;
112 regidx_overlap(idx, seqs[iseq], 0, UINT32_MAX, &itr);
113 while ( itr.i < itr.n )
114 {
115 str.l = 0;
116 ksprintf(&str, "%s:%d-%d", seqs[iseq], REGITR_START(itr)+1, REGITR_END(itr)+1);
117 regs[ireg++] = strdup(str.s);
118 itr.i++;
119 }
120 }
121 regidx_destroy(idx);
122 }
123 free(str.s);
124
125 if ( !ireg )
126 {
127 if ( argc )
128 regs = (char**) malloc(sizeof(char*)*argc);
129 else
130 {
131 regs = (char**) malloc(sizeof(char*));
132 regs[0] = strdup(".");
133 *nregs = 1;
134 }
135 }
136
137 for (iseq=0; iseq<argc; iseq++) regs[ireg++] = strdup(argv[iseq]);
138 return regs;
139 }
140 static int query_regions(args_t *args, char *fname, char **regs, int nregs)
141 {
142 int i;
143 htsFile *fp = hts_open(fname,"r");
144 if ( !fp ) error("Could not read %s\n", fname);
145 enum htsExactFormat format = hts_get_format(fp)->format;
146
147 regidx_t *reg_idx = NULL;
148 if ( args->targets_fname )
149 {
150 reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
151 if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
152 }
153
154 if ( format == bcf )
155 {
156 htsFile *out = hts_open("-","w");
157 if ( !out ) error("Could not open stdout\n", fname);
158 hts_idx_t *idx = bcf_index_load(fname);
159 if ( !idx ) error("Could not load .csi index of %s\n", fname);
160 bcf_hdr_t *hdr = bcf_hdr_read(fp);
161 if ( !hdr ) error("Could not read the header: %s\n", fname);
162 if ( args->print_header )
163 bcf_hdr_write(out,hdr);
164 if ( !args->header_only )
165 {
166 bcf1_t *rec = bcf_init();
167 for (i=0; i<nregs; i++)
168 {
169 hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
170 while ( bcf_itr_next(fp, itr, rec) >=0 )
171 {
172 if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
173 bcf_write(out,hdr,rec);
174 }
175 tbx_itr_destroy(itr);
176 }
177 bcf_destroy(rec);
178 }
179 if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
180 bcf_hdr_destroy(hdr);
181 hts_idx_destroy(idx);
182 }
183 else if ( format==vcf || format==sam || format==unknown_format )
184 {
185 tbx_t *tbx = tbx_index_load(fname);
186 if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
187 kstring_t str = {0,0,0};
188 if ( args->print_header )
189 {
190 while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
191 {
192 if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
193 puts(str.s);
194 }
195 }
196 if ( !args->header_only )
197 {
198 int nseq;
199 const char **seq = NULL;
200 if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
201 for (i=0; i<nregs; i++)
202 {
203 hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
204 if ( !itr ) continue;
205 while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
206 {
207 if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
208 puts(str.s);
209 }
210 tbx_itr_destroy(itr);
211 }
212 free(seq);
213 }
214 free(str.s);
215 tbx_destroy(tbx);
216 }
217 else if ( format==bam )
218 error("Please use \"samtools view\" for querying BAM files.\n");
219
220 if ( reg_idx ) regidx_destroy(reg_idx);
221 if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
222
223 for (i=0; i<nregs; i++) free(regs[i]);
224 free(regs);
225 return 0;
226 }
227 static int query_chroms(char *fname)
228 {
229 const char **seq;
230 int i, nseq, ftype = file_type(fname);
231 if ( ftype & IS_TXT || !ftype )
232 {
233 tbx_t *tbx = tbx_index_load(fname);
234 if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
235 seq = tbx_seqnames(tbx, &nseq);
236 for (i=0; i<nseq; i++)
237 printf("%s\n", seq[i]);
238 free(seq);
239 tbx_destroy(tbx);
240 }
241 else if ( ftype==IS_BCF )
242 {
243 htsFile *fp = hts_open(fname,"r");
244 if ( !fp ) error("Could not read %s\n", fname);
245 bcf_hdr_t *hdr = bcf_hdr_read(fp);
246 if ( !hdr ) error("Could not read the header: %s\n", fname);
247 hts_close(fp);
248 hts_idx_t *idx = bcf_index_load(fname);
249 if ( !idx ) error("Could not load .csi index of %s\n", fname);
250 seq = bcf_index_seqnames(idx, hdr, &nseq);
251 for (i=0; i<nseq; i++)
252 printf("%s\n", seq[i]);
253 free(seq);
254 bcf_hdr_destroy(hdr);
255 hts_idx_destroy(idx);
256 }
257 else if ( ftype==IS_BAM ) // todo: BAM
258 error("BAM: todo\n");
259 return 0;
260 }
261
262 int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
263 {
264 if ( ftype & IS_TXT || !ftype )
265 {
266 BGZF *fp = bgzf_open(fname,"r");
267 if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) return -1;
268
269 char *buffer = fp->uncompressed_block;
270 int skip_until = 0;
271
272 // Skip the header: find out the position of the data block
273 if ( buffer[0]==conf->meta_char )
274 {
275 skip_until = 1;
276 while (1)
277 {
278 if ( buffer[skip_until]=='\n' )
279 {
280 skip_until++;
281 if ( skip_until>=fp->block_length )
282 {
283 if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname);
284 skip_until = 0;
285 }
286 // The header has finished
287 if ( buffer[skip_until]!=conf->meta_char ) break;
288 }
289 skip_until++;
290 if ( skip_until>=fp->block_length )
291 {
292 if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname);
293 skip_until = 0;
294 }
295 }
296 }
297
298 // Output the new header
299 FILE *hdr = fopen(header,"r");
300 if ( !hdr ) error("%s: %s", header,strerror(errno));
301 int page_size = getpagesize();
302 char *buf = valloc(page_size);
303 BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w");
304 ssize_t nread;
305 while ( (nread=fread(buf,1,page_size-1,hdr))>0 )
306 {
307 if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n';
308 if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %d\n",bgzf_out->errcode);
309 }
310 if ( fclose(hdr) ) error("close failed: %s\n", header);
311
312 // Output all remainig data read with the header block
313 if ( fp->block_length - skip_until > 0 )
314 {
315 if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",fp->errcode);
316 }
317 if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
318
319 while (1)
320 {
321 nread = bgzf_raw_read(fp, buf, page_size);
322 if ( nread<=0 ) break;
323
324 int count = bgzf_raw_write(bgzf_out, buf, nread);
325 if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
326 }
327 if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
328 if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
329 }
330 else
331 error("todo: reheader BCF, BAM\n"); // BCF is difficult, records contain pointers to the header.
332 return 0;
333 }
334
335 static int usage(void)
336 {
337 fprintf(stderr, "\n");
338 fprintf(stderr, "Version: %s\n", hts_version());
339 fprintf(stderr, "Usage: tabix [OPTIONS] [FILE] [REGION [...]]\n");
340 fprintf(stderr, "\n");
341 fprintf(stderr, "Indexing Options:\n");
342 fprintf(stderr, " -0, --zero-based coordinates are zero-based\n");
343 fprintf(stderr, " -b, --begin INT column number for region start [4]\n");
344 fprintf(stderr, " -c, --comment CHAR skip comment lines starting with CHAR [null]\n");
345 fprintf(stderr, " -C, --csi generate CSI index for VCF (default is TBI)\n");
346 fprintf(stderr, " -e, --end INT column number for region end (if no end, set INT to -b) [5]\n");
347 fprintf(stderr, " -f, --force overwrite existing index without asking\n");
348 fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
349 fprintf(stderr, " -p, --preset STR gff, bed, sam, vcf\n");
350 fprintf(stderr, " -s, --sequence INT column number for sequence names (suppressed by -p) [1]\n");
351 fprintf(stderr, " -S, --skip-lines INT skip first INT lines [0]\n");
352 fprintf(stderr, "\n");
353 fprintf(stderr, "Querying and other options:\n");
354 fprintf(stderr, " -h, --print-header print also the header lines\n");
355 fprintf(stderr, " -H, --only-header print only the header lines\n");
356 fprintf(stderr, " -l, --list-chroms list chromosome names\n");
357 fprintf(stderr, " -r, --reheader FILE replace the header with the content of FILE\n");
358 fprintf(stderr, " -R, --regions FILE restrict to regions listed in the file\n");
359 fprintf(stderr, " -T, --targets FILE similar to -R but streams rather than index-jumps\n");
360 fprintf(stderr, "\n");
361 return 1;
362 }
363
364 int main(int argc, char *argv[])
365 {
366 int c, min_shift = 0, is_force = 0, list_chroms = 0, do_csi = 0;
367 tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
368 char *reheader = NULL;
369 args_t args;
370 memset(&args,0,sizeof(args_t));
371
372 static struct option loptions[] =
373 {
374 {"help",0,0,'h'},
375 {"regions",1,0,'R'},
376 {"targets",1,0,'T'},
377 {"csi",0,0,'C'},
378 {"zero-based",0,0,'0'},
379 {"print-header",0,0,'h'},
380 {"only-header",0,0,'H'},
381 {"begin",1,0,'b'},
382 {"comment",1,0,'c'},
383 {"end",1,0,'e'},
384 {"force",0,0,'f'},
385 {"preset",1,0,'p'},
386 {"sequence",1,0,'s'},
387 {"skip-lines",1,0,'S'},
388 {"list-chroms",0,0,'l'},
389 {"reheader",1,0,'r'},
390 {0,0,0,0}
391 };
392
393 while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:", loptions,NULL)) >= 0)
394 {
395 switch (c)
396 {
397 case 'R': args.regions_fname = optarg; break;
398 case 'T': args.targets_fname = optarg; break;
399 case 'C': do_csi = 1; break;
400 case 'r': reheader = optarg; break;
401 case 'h': args.print_header = 1; break;
402 case 'H': args.header_only = 1; break;
403 case 'l': list_chroms = 1; break;
404 case '0': conf.preset |= TBX_UCSC; break;
405 case 'b': conf.bc = atoi(optarg); break;
406 case 'e': conf.ec = atoi(optarg); break;
407 case 'c': conf.meta_char = *optarg; break;
408 case 'f': is_force = 1; break;
409 case 'm': min_shift = atoi(optarg); break;
410 case 'p':
411 if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
412 else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
413 else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
414 else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
415 else if (strcmp(optarg, "bcf") == 0) ; // bcf is autodetected, preset is not needed
416 else if (strcmp(optarg, "bam") == 0) ; // same as bcf
417 else error("The preset string not recognised: '%s'\n", optarg);
418 break;
419 case 's': conf.sc = atoi(optarg); break;
420 case 'S': conf.line_skip = atoi(optarg); break;
421 default: return usage();
422 }
423 }
424
425 if ( optind==argc ) return usage();
426
427 if ( list_chroms )
428 return query_chroms(argv[optind]);
429
430 if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname )
431 {
432 int nregs = 0;
433 char **regs = NULL;
434 if ( !args.header_only )
435 regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs);
436 return query_regions(&args, argv[optind], regs, nregs);
437 }
438
439 char *fname = argv[optind];
440 int ftype = file_type(fname);
441 if ( !conf_ptr ) // no preset given
442 {
443 if ( ftype==IS_GFF ) conf_ptr = &tbx_conf_gff;
444 else if ( ftype==IS_BED ) conf_ptr = &tbx_conf_bed;
445 else if ( ftype==IS_SAM ) conf_ptr = &tbx_conf_sam;
446 else if ( ftype==IS_VCF )
447 {
448 conf_ptr = &tbx_conf_vcf;
449 if ( !min_shift && do_csi ) min_shift = 14;
450 }
451 else if ( ftype==IS_BCF )
452 {
453 if ( !min_shift ) min_shift = 14;
454 }
455 else if ( ftype==IS_BAM )
456 {
457 if ( !min_shift ) min_shift = 14;
458 }
459 }
460 if ( do_csi )
461 {
462 if ( !min_shift ) min_shift = 14;
463 min_shift *= do_csi; // positive for CSIv2, negative for CSIv1
464 }
465 if ( min_shift!=0 && !do_csi ) do_csi = 1;
466
467 if ( reheader )
468 return reheader_file(fname, reheader, ftype, conf_ptr);
469
470 if ( conf_ptr )
471 conf = *conf_ptr;
472
473 char *suffix = ".tbi";
474 if ( do_csi ) suffix = ".csi";
475 else if ( ftype==IS_BAM ) suffix = ".bai";
476 else if ( ftype==IS_CRAM ) suffix = ".crai";
477
478 char *idx_fname = calloc(strlen(fname) + 5, 1);
479 strcat(strcpy(idx_fname, fname), suffix);
480
481 struct stat stat_tbi, stat_file;
482 if ( !is_force && stat(idx_fname, &stat_tbi)==0 )
483 {
484 // Before complaining about existing index, check if the VCF file isn't
485 // newer. This is a common source of errors, people tend not to notice
486 // that tabix failed
487 stat(fname, &stat_file);
488 if ( stat_file.st_mtime <= stat_tbi.st_mtime )
489 error("[tabix] the index file exists. Please use '-f' to overwrite.\n");
490 }
491 free(idx_fname);
492
493 if ( ftype==IS_CRAM )
494 {
495 if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
496 return 0;
497 }
498 else if ( do_csi )
499 {
500 if ( ftype==IS_BCF )
501 {
502 if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname);
503 return 0;
504 }
505 if ( ftype==IS_BAM )
506 {
507 if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
508 return 0;
509 }
510 if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname);
511 return 0;
512 }
513 else // TBI index
514 {
515 if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname);
516 return 0;
517 }
518 return 0;
519 }