annotate ezBAMQC/src/htslib/hts.c @ 9:6610eedd9fae

Uploaded
author cshl-bsr
date Wed, 30 Mar 2016 12:11:46 -0400
parents dfa3745e5fd8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1 /* hts.c -- format-neutral I/O, indexing, and iterator API functions.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3 Copyright (C) 2008, 2009, 2012-2015 Genome Research Ltd.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
4 Copyright (C) 2012, 2013 Broad Institute.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
5
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
6 Author: Heng Li <lh3@sanger.ac.uk>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
7
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
8 Permission is hereby granted, free of charge, to any person obtaining a copy
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
9 of this software and associated documentation files (the "Software"), to deal
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
10 in the Software without restriction, including without limitation the rights
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
12 copies of the Software, and to permit persons to whom the Software is
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
13 furnished to do so, subject to the following conditions:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
14
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
15 The above copyright notice and this permission notice shall be included in
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
16 all copies or substantial portions of the Software.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
17
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
24 DEALINGS IN THE SOFTWARE. */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
25
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
26 #include <zlib.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
27 #include <ctype.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
28 #include <stdio.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
29 #include <string.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
30 #include <stdlib.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
31 #include <limits.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
32 #include <fcntl.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
33 #include <errno.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
34 #include <sys/stat.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
35 #include "htslib/bgzf.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
36 #include "htslib/hts.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
37 #include "cram/cram.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
38 #include "htslib/hfile.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
39 #include "version.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
40
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
41 #include "htslib/kseq.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
42 #define KS_BGZF 1
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
43 #if KS_BGZF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
44 // bgzf now supports gzip-compressed files, the gzFile branch can be removed
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
45 KSTREAM_INIT2(, BGZF*, bgzf_read, 65536)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
46 #else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
47 KSTREAM_INIT2(, gzFile, gzread, 16384)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
48 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
49
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
50 #include "htslib/khash.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
51 KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
52
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
53 int hts_verbose = 3;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
54
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
55 const char *hts_version()
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
56 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
57 return HTS_VERSION;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
58 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
59
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
60 const unsigned char seq_nt16_table[256] = {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
61 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
62 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
63 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
64 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
65 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
66 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
67 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
68 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
69
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
70 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
71 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
72 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
73 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
74 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
75 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
76 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
77 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
78 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
79
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
80 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
81
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
82 const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
83
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
84 /**********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
85 *** Basic file I/O ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
86 **********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
87
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
88 static enum htsFormatCategory format_category(enum htsExactFormat fmt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
89 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
90 switch (fmt) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
91 case bam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
92 case sam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
93 case cram:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
94 return sequence_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
95
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
96 case vcf:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
97 case bcf:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
98 return variant_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
99
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
100 case bai:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
101 case crai:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
102 case csi:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
103 case gzi:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
104 case tbi:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
105 return index_file;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
106
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
107 case bed:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
108 return region_list;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
109
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
110 case unknown_format:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
111 case binary_format:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
112 case text_format:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
113 case format_maximum:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
114 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
115 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
116
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
117 return unknown_category;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
118 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
119
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
120 // Decompress up to ten or so bytes by peeking at the file, which must be
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
121 // positioned at the start of a GZIP block.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
122 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
123 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
124 // Typically at most a couple of hundred bytes of input are required
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
125 // to get a few bytes of output from inflate(), so hopefully this buffer
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
126 // size suffices in general.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
127 unsigned char buffer[512];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
128 z_stream zs;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
129 ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
130
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
131 if (npeek < 0) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
132
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
133 zs.zalloc = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
134 zs.zfree = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
135 zs.next_in = buffer;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
136 zs.avail_in = npeek;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
137 zs.next_out = dest;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
138 zs.avail_out = destsize;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
139 if (inflateInit2(&zs, 31) != Z_OK) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
140
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
141 while (zs.total_out < destsize)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
142 if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
143
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
144 destsize = zs.total_out;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
145 inflateEnd(&zs);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
146
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
147 return destsize;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
148 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
149
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
150 // Parse "x.y" text, taking care because the string is not NUL-terminated
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
151 // and filling in major/minor only when the digits are followed by a delimiter,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
152 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
153 static void
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
154 parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
155 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
156 const char *str = (const char *) u;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
157 const char *slim = (const char *) ulim;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
158 const char *s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
159
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
160 fmt->version.major = fmt->version.minor = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
161
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
162 for (s = str; s < slim; s++) if (!isdigit(*s)) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
163 if (s < slim) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
164 fmt->version.major = atoi(str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
165 if (*s == '.') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
166 str = &s[1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
167 for (s = str; s < slim; s++) if (!isdigit(*s)) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
168 if (s < slim)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
169 fmt->version.minor = atoi(str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
170 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
171 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
172 fmt->version.minor = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
173 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
174 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
175
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
176 int hts_detect_format(hFILE *hfile, htsFormat *fmt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
177 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
178 unsigned char s[21];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
179 ssize_t len = hpeek(hfile, s, 18);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
180 if (len < 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
181
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
182 if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
183 // The stream is either gzip-compressed or BGZF-compressed.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
184 // Determine which, and decompress the first few bytes.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
185 fmt->compression = (len >= 18 && (s[3] & 4) &&
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
186 memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
187 len = decompress_peek(hfile, s, sizeof s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
188 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
189 else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
190 fmt->compression = no_compression;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
191 len = hpeek(hfile, s, sizeof s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
192 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
193 if (len < 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
194
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
195 fmt->compression_level = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
196 fmt->specific = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
197
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
198 if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
199 fmt->category = sequence_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
200 fmt->format = cram;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
201 fmt->version.major = s[4], fmt->version.minor = s[5];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
202 fmt->compression = custom;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
203 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
204 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
205 else if (len >= 4 && s[3] <= '\4') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
206 if (memcmp(s, "BAM\1", 4) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
207 fmt->category = sequence_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
208 fmt->format = bam;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
209 // TODO Decompress enough to pick version from @HD-VN header
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
210 fmt->version.major = 1, fmt->version.minor = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
211 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
212 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
213 else if (memcmp(s, "BAI\1", 4) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
214 fmt->category = index_file;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
215 fmt->format = bai;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
216 fmt->version.major = -1, fmt->version.minor = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
217 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
218 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
219 else if (memcmp(s, "BCF\4", 4) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
220 fmt->category = variant_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
221 fmt->format = bcf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
222 fmt->version.major = 1, fmt->version.minor = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
223 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
224 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
225 else if (memcmp(s, "BCF\2", 4) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
226 fmt->category = variant_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
227 fmt->format = bcf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
228 fmt->version.major = s[3];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
229 fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
230 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
231 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
232 else if (memcmp(s, "CSI\1", 4) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
233 fmt->category = index_file;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
234 fmt->format = csi;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
235 fmt->version.major = 1, fmt->version.minor = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
236 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
237 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
238 else if (memcmp(s, "TBI\1", 4) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
239 fmt->category = index_file;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
240 fmt->format = tbi;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
241 fmt->version.major = -1, fmt->version.minor = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
242 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
243 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
244 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
245 else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
246 fmt->category = variant_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
247 fmt->format = vcf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
248 if (len >= 21 && s[16] == 'v')
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
249 parse_version(fmt, &s[17], &s[len]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
250 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
251 fmt->version.major = fmt->version.minor = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
252 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
253 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
254 else if (len >= 4 && s[0] == '@' &&
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
255 (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
256 memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
257 fmt->category = sequence_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
258 fmt->format = sam;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
259 // @HD-VN is not guaranteed to be the first tag, but then @HD is
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
260 // not guaranteed to be present at all...
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
261 if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
262 parse_version(fmt, &s[7], &s[len]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
263 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
264 fmt->version.major = 1, fmt->version.minor = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
265 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
266 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
267 else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
268 // Various possibilities for tab-delimited text:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
269 // .crai (gzipped tab-delimited six columns: seqid 5*number)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
270 // .bed ([3..12] tab-delimited columns)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
271 // .bedpe (>= 10 tab-delimited columns)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
272 // .sam (tab-delimited >= 11 columns: seqid number seqid...)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
273 // FIXME For now, assume it's SAM
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
274 fmt->category = sequence_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
275 fmt->format = sam;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
276 fmt->version.major = 1, fmt->version.minor = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
277 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
278 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
279
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
280 fmt->category = unknown_category;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
281 fmt->format = unknown_format;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
282 fmt->version.major = fmt->version.minor = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
283 fmt->compression = no_compression;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
284 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
285 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
286
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
287 char *hts_format_description(const htsFormat *format)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
288 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
289 kstring_t str = { 0, 0, NULL };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
290
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
291 switch (format->format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
292 case sam: kputs("SAM", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
293 case bam: kputs("BAM", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
294 case cram: kputs("CRAM", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
295 case vcf: kputs("VCF", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
296 case bcf:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
297 if (format->version.major == 1) kputs("Legacy BCF", &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
298 else kputs("BCF", &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
299 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
300 case bai: kputs("BAI", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
301 case crai: kputs("CRAI", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
302 case csi: kputs("CSI", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
303 case tbi: kputs("Tabix", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
304 default: kputs("unknown", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
305 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
306
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
307 if (format->version.major >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
308 kputs(" version ", &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
309 kputw(format->version.major, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
310 if (format->version.minor >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
311 kputc('.', &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
312 kputw(format->version.minor, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
313 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
314 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
315
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
316 switch (format->compression) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
317 case custom: kputs(" compressed", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
318 case gzip: kputs(" gzip-compressed", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
319 case bgzf:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
320 switch (format->format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
321 case bam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
322 case bcf:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
323 case csi:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
324 case tbi:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
325 // These are by definition BGZF, so just use the generic term
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
326 kputs(" compressed", &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
327 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
328 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
329 kputs(" BGZF-compressed", &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
330 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
331 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
332 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
333 default: break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
334 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
335
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
336 switch (format->category) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
337 case sequence_data: kputs(" sequence", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
338 case variant_data: kputs(" variant calling", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
339 case index_file: kputs(" index", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
340 case region_list: kputs(" genomic region", &str); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
341 default: break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
342 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
343
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
344 if (format->compression == no_compression)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
345 switch (format->format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
346 case sam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
347 case crai:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
348 case vcf:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
349 case bed:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
350 kputs(" text", &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
351 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
352
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
353 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
354 kputs(" data", &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
355 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
356 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
357 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
358 kputs(" data", &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
359
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
360 return ks_release(&str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
361 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
362
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
363 htsFile *hts_open(const char *fn, const char *mode)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
364 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
365 htsFile *fp = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
366 hFILE *hfile = hopen(fn, mode);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
367 if (hfile == NULL) goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
368
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
369 fp = hts_hopen(hfile, fn, mode);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
370 if (fp == NULL) goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
371
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
372 return fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
373
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
374 error:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
375 if (hts_verbose >= 2)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
376 fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
377
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
378 if (hfile)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
379 hclose_abruptly(hfile);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
380
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
381 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
382 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
383
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
384 htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
385 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
386 htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
387 if (fp == NULL) goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
388
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
389 fp->fn = strdup(fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
390 fp->is_be = ed_is_big();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
391
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
392 if (strchr(mode, 'r')) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
393 if (hts_detect_format(hfile, &fp->format) < 0) goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
394 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
395 else if (strchr(mode, 'w') || strchr(mode, 'a')) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
396 htsFormat *fmt = &fp->format;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
397 fp->is_write = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
398
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
399 if (strchr(mode, 'b')) fmt->format = binary_format;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
400 else if (strchr(mode, 'c')) fmt->format = cram;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
401 else fmt->format = text_format;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
402
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
403 if (strchr(mode, 'z')) fmt->compression = bgzf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
404 else if (strchr(mode, 'g')) fmt->compression = gzip;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
405 else if (strchr(mode, 'u')) fmt->compression = no_compression;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
406 else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
407 // No compression mode specified, set to the default for the format
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
408 switch (fmt->format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
409 case binary_format: fmt->compression = bgzf; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
410 case cram: fmt->compression = custom; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
411 case text_format: fmt->compression = no_compression; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
412 default: abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
413 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
414 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
415
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
416 // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
417 fmt->category = format_category(fmt->format);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
418
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
419 fmt->version.major = fmt->version.minor = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
420 fmt->compression_level = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
421 fmt->specific = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
422 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
423 else goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
424
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
425 switch (fp->format.format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
426 case binary_format:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
427 case bam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
428 case bcf:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
429 fp->fp.bgzf = bgzf_hopen(hfile, mode);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
430 if (fp->fp.bgzf == NULL) goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
431 fp->is_bin = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
432 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
433
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
434 case cram:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
435 fp->fp.cram = cram_dopen(hfile, fn, mode);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
436 if (fp->fp.cram == NULL) goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
437 if (!fp->is_write)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
438 cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
439 fp->is_cram = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
440 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
441
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
442 case text_format:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
443 case sam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
444 case vcf:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
445 if (!fp->is_write) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
446 #if KS_BGZF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
447 BGZF *gzfp = bgzf_hopen(hfile, mode);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
448 #else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
449 // TODO Implement gzip hFILE adaptor
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
450 hclose(hfile); // This won't work, especially for stdin
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
451 gzFile gzfp = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
452 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
453 if (gzfp) fp->fp.voidp = ks_init(gzfp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
454 else goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
455 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
456 else if (fp->format.compression != no_compression) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
457 fp->fp.bgzf = bgzf_hopen(hfile, mode);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
458 if (fp->fp.bgzf == NULL) goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
459 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
460 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
461 fp->fp.hfile = hfile;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
462 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
463
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
464 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
465 goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
466 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
467
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
468 return fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
469
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
470 error:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
471 if (hts_verbose >= 2)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
472 fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
473
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
474 if (fp) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
475 free(fp->fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
476 free(fp->fn_aux);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
477 free(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
478 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
479 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
480 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
481
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
482 int hts_close(htsFile *fp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
483 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
484 int ret, save;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
485
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
486 switch (fp->format.format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
487 case binary_format:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
488 case bam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
489 case bcf:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
490 ret = bgzf_close(fp->fp.bgzf);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
491 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
492
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
493 case cram:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
494 if (!fp->is_write) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
495 switch (cram_eof(fp->fp.cram)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
496 case 0:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
497 fprintf(stderr, "[E::%s] Failed to decode sequence.\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
498 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
499 case 2:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
500 fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
501 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
502 default: /* case 1, expected EOF */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
503 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
504 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
505 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
506 ret = cram_close(fp->fp.cram);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
507 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
508
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
509 case text_format:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
510 case sam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
511 case vcf:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
512 if (!fp->is_write) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
513 #if KS_BGZF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
514 BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
515 ret = bgzf_close(gzfp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
516 #else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
517 gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
518 ret = gzclose(gzfp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
519 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
520 ks_destroy((kstream_t*)fp->fp.voidp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
521 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
522 else if (fp->format.compression != no_compression)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
523 ret = bgzf_close(fp->fp.bgzf);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
524 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
525 ret = hclose(fp->fp.hfile);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
526 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
527
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
528 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
529 ret = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
530 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
531 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
532
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
533 save = errno;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
534 free(fp->fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
535 free(fp->fn_aux);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
536 free(fp->line.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
537 free(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
538 errno = save;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
539 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
540 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
541
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
542 const htsFormat *hts_get_format(htsFile *fp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
543 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
544 return fp? &fp->format : NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
545 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
546
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
547 int hts_set_opt(htsFile *fp, enum cram_option opt, ...) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
548 int r;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
549 va_list args;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
550
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
551 if (fp->format.format != cram)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
552 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
553
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
554 va_start(args, opt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
555 r = cram_set_voption(fp->fp.cram, opt, args);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
556 va_end(args);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
557
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
558 return r;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
559 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
560
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
561 int hts_set_threads(htsFile *fp, int n)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
562 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
563 if (fp->format.compression == bgzf) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
564 return bgzf_mt(fp->fp.bgzf, n, 256);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
565 } else if (fp->format.format == cram) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
566 return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
567 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
568 else return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
569 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
570
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
571 int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
572 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
573 free(fp->fn_aux);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
574 if (fn_aux) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
575 fp->fn_aux = strdup(fn_aux);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
576 if (fp->fn_aux == NULL) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
577 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
578 else fp->fn_aux = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
579
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
580 if (fp->format.format == cram)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
581 cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
582
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
583 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
584 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
585
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
586 // For VCF/BCF backward sweeper. Not exposing these functions because their
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
587 // future is uncertain. Things will probably have to change with hFILE...
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
588 BGZF *hts_get_bgzfp(htsFile *fp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
589 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
590 if ( fp->is_bin )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
591 return fp->fp.bgzf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
592 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
593 return ((kstream_t*)fp->fp.voidp)->f;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
594 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
595 int hts_useek(htsFile *fp, long uoffset, int where)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
596 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
597 if ( fp->is_bin )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
598 return bgzf_useek(fp->fp.bgzf, uoffset, where);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
599 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
600 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
601 ks_rewind((kstream_t*)fp->fp.voidp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
602 ((kstream_t*)fp->fp.voidp)->seek_pos = uoffset;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
603 return bgzf_useek(((kstream_t*)fp->fp.voidp)->f, uoffset, where);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
604 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
605 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
606 long hts_utell(htsFile *fp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
607 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
608 if ( fp->is_bin )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
609 return bgzf_utell(fp->fp.bgzf);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
610 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
611 return ((kstream_t*)fp->fp.voidp)->seek_pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
612 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
613
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
614 int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
615 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
616 int ret, dret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
617 ret = ks_getuntil((kstream_t*)fp->fp.voidp, delimiter, str, &dret);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
618 ++fp->lineno;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
619 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
620 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
621
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
622 char **hts_readlist(const char *string, int is_file, int *_n)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
623 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
624 int m = 0, n = 0, dret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
625 char **s = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
626 if ( is_file )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
627 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
628 #if KS_BGZF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
629 BGZF *fp = bgzf_open(string, "r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
630 #else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
631 gzFile fp = gzopen(string, "r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
632 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
633 if ( !fp ) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
634
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
635 kstream_t *ks;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
636 kstring_t str;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
637 str.s = 0; str.l = str.m = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
638 ks = ks_init(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
639 while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
640 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
641 if (str.l == 0) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
642 n++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
643 hts_expand(char*,n,m,s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
644 s[n-1] = strdup(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
645 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
646 ks_destroy(ks);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
647 #if KS_BGZF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
648 bgzf_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
649 #else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
650 gzclose(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
651 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
652 free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
653 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
654 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
655 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
656 const char *q = string, *p = string;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
657 while ( 1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
658 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
659 if (*p == ',' || *p == 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
660 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
661 n++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
662 hts_expand(char*,n,m,s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
663 s[n-1] = (char*)calloc(p - q + 1, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
664 strncpy(s[n-1], q, p - q);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
665 q = p + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
666 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
667 if ( !*p ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
668 p++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
669 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
670 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
671 s = (char**)realloc(s, n * sizeof(char*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
672 *_n = n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
673 return s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
674 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
675
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
676 char **hts_readlines(const char *fn, int *_n)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
677 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
678 int m = 0, n = 0, dret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
679 char **s = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
680 #if KS_BGZF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
681 BGZF *fp = bgzf_open(fn, "r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
682 #else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
683 gzFile fp = gzopen(fn, "r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
684 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
685 if ( fp ) { // read from file
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
686 kstream_t *ks;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
687 kstring_t str;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
688 str.s = 0; str.l = str.m = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
689 ks = ks_init(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
690 while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
691 if (str.l == 0) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
692 if (m == n) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
693 m = m? m<<1 : 16;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
694 s = (char**)realloc(s, m * sizeof(char*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
695 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
696 s[n++] = strdup(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
697 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
698 ks_destroy(ks);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
699 #if KS_BGZF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
700 bgzf_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
701 #else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
702 gzclose(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
703 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
704 s = (char**)realloc(s, n * sizeof(char*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
705 free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
706 } else if (*fn == ':') { // read from string
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
707 const char *q, *p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
708 for (q = p = fn + 1;; ++p)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
709 if (*p == ',' || *p == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
710 if (m == n) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
711 m = m? m<<1 : 16;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
712 s = (char**)realloc(s, m * sizeof(char*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
713 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
714 s[n] = (char*)calloc(p - q + 1, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
715 strncpy(s[n++], q, p - q);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
716 q = p + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
717 if (*p == 0) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
718 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
719 } else return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
720 s = (char**)realloc(s, n * sizeof(char*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
721 *_n = n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
722 return s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
723 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
724
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
725 // DEPRECATED: To be removed in a future HTSlib release
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
726 int hts_file_type(const char *fname)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
727 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
728 int len = strlen(fname);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
729 if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
730 if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
731 if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
732 if ( !strcmp("-",fname) ) return FT_STDIN;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
733
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
734 hFILE *f = hopen(fname, "r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
735 if (f == NULL) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
736
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
737 htsFormat fmt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
738 if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
739 if (hclose(f) < 0) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
740
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
741 switch (fmt.format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
742 case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
743 case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
744 default: return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
745 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
746 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
747
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
748 /****************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
749 *** Indexing ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
750 ****************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
751
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
752 #define HTS_MIN_MARKER_DIST 0x10000
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
753
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
754 // Finds the special meta bin
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
755 // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
756 #define META_BIN(idx) ((idx)->n_bins + 1)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
757
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
758 #define pair64_lt(a,b) ((a).u < (b).u)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
759
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
760 #include "htslib/ksort.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
761 KSORT_INIT(_off, hts_pair64_t, pair64_lt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
762
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
763 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
764 int32_t m, n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
765 uint64_t loff;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
766 hts_pair64_t *list;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
767 } bins_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
768
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
769 #include "htslib/khash.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
770 KHASH_MAP_INIT_INT(bin, bins_t)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
771 typedef khash_t(bin) bidx_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
772
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
773 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
774 int32_t n, m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
775 uint64_t *offset;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
776 } lidx_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
777
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
778 struct __hts_idx_t {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
779 int fmt, min_shift, n_lvls, n_bins;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
780 uint32_t l_meta;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
781 int32_t n, m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
782 uint64_t n_no_coor;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
783 bidx_t **bidx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
784 lidx_t *lidx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
785 uint8_t *meta;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
786 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
787 uint32_t last_bin, save_bin;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
788 int last_coor, last_tid, save_tid, finished;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
789 uint64_t last_off, save_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
790 uint64_t off_beg, off_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
791 uint64_t n_mapped, n_unmapped;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
792 } z; // keep internal states
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
793 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
794
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
795 static inline void insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
796 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
797 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
798 bins_t *l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
799 int absent;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
800 k = kh_put(bin, b, bin, &absent);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
801 l = &kh_value(b, k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
802 if (absent) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
803 l->m = 1; l->n = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
804 l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
805 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
806 if (l->n == l->m) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
807 l->m <<= 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
808 l->list = (hts_pair64_t*)realloc(l->list, l->m * sizeof(hts_pair64_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
809 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
810 l->list[l->n].u = beg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
811 l->list[l->n++].v = end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
812 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
813
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
814 static inline void insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
815 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
816 int i, beg, end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
817 beg = _beg >> min_shift;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
818 end = (_end - 1) >> min_shift;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
819 if (l->m < end + 1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
820 int old_m = l->m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
821 l->m = end + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
822 kroundup32(l->m);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
823 l->offset = (uint64_t*)realloc(l->offset, l->m * sizeof(uint64_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
824 memset(l->offset + old_m, 0xff, 8 * (l->m - old_m)); // fill l->offset with (uint64_t)-1
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
825 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
826 if (beg == end) { // to save a loop in this case
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
827 if (l->offset[beg] == (uint64_t)-1) l->offset[beg] = offset;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
828 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
829 for (i = beg; i <= end; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
830 if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
831 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
832 if (l->n < end + 1) l->n = end + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
833 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
834
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
835 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
836 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
837 hts_idx_t *idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
838 idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
839 if (idx == NULL) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
840 idx->fmt = fmt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
841 idx->min_shift = min_shift;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
842 idx->n_lvls = n_lvls;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
843 idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
844 idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
845 idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
846 idx->z.last_coor = 0xffffffffu;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
847 if (n) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
848 idx->n = idx->m = n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
849 idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
850 if (idx->bidx == NULL) { free(idx); return NULL; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
851 idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
852 if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
853 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
854 return idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
855 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
856
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
857 static void update_loff(hts_idx_t *idx, int i, int free_lidx)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
858 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
859 bidx_t *bidx = idx->bidx[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
860 lidx_t *lidx = &idx->lidx[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
861 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
862 int l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
863 uint64_t offset0 = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
864 if (bidx) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
865 k = kh_get(bin, bidx, META_BIN(idx));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
866 if (k != kh_end(bidx))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
867 offset0 = kh_val(bidx, k).list[0].u;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
868 for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
869 lidx->offset[l] = offset0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
870 } else l = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
871 for (; l < lidx->n; ++l) // fill missing values
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
872 if (lidx->offset[l] == (uint64_t)-1)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
873 lidx->offset[l] = lidx->offset[l-1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
874 if (bidx == 0) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
875 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
876 if (kh_exist(bidx, k))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
877 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
878 if ( kh_key(bidx, k) < idx->n_bins )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
879 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
880 int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
881 // disable linear index if bot_bin out of bounds
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
882 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
883 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
884 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
885 kh_val(bidx, k).loff = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
886 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
887 if (free_lidx) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
888 free(lidx->offset);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
889 lidx->m = lidx->n = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
890 lidx->offset = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
891 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
892 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
893
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
894 static void compress_binning(hts_idx_t *idx, int i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
895 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
896 bidx_t *bidx = idx->bidx[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
897 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
898 int l, m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
899 if (bidx == 0) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
900 // merge a bin to its parent if the bin is too small
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
901 for (l = idx->n_lvls; l > 0; --l) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
902 unsigned start = hts_bin_first(l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
903 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
904 bins_t *p, *q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
905 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
906 p = &kh_value(bidx, k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
907 if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
908 if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
909 khint_t kp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
910 kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
911 if (kp == kh_end(bidx)) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
912 q = &kh_val(bidx, kp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
913 if (q->n + p->n > q->m) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
914 q->m = q->n + p->n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
915 kroundup32(q->m);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
916 q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
917 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
918 memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
919 q->n += p->n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
920 free(p->list);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
921 kh_del(bin, bidx, k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
922 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
923 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
924 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
925 k = kh_get(bin, bidx, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
926 if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
927 // merge adjacent chunks that start from the same BGZF block
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
928 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
929 bins_t *p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
930 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
931 p = &kh_value(bidx, k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
932 for (l = 1, m = 0; l < p->n; ++l) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
933 if (p->list[m].v>>16 >= p->list[l].u>>16) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
934 if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
935 } else p->list[++m] = p->list[l];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
936 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
937 p->n = m + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
938 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
939 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
940
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
941 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
942 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
943 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
944 if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
945 if (idx->z.save_tid >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
946 insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
947 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
948 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
949 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
950 for (i = 0; i < idx->n; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
951 update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
952 compress_binning(idx, i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
953 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
954 idx->z.finished = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
955 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
956
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
957 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
958 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
959 int bin;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
960 if (tid<0) beg = -1, end = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
961 if (tid >= idx->m) { // enlarge the index
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
962 int32_t oldm = idx->m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
963 idx->m = idx->m? idx->m<<1 : 2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
964 idx->bidx = (bidx_t**)realloc(idx->bidx, idx->m * sizeof(bidx_t*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
965 idx->lidx = (lidx_t*) realloc(idx->lidx, idx->m * sizeof(lidx_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
966 memset(&idx->bidx[oldm], 0, (idx->m - oldm) * sizeof(bidx_t*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
967 memset(&idx->lidx[oldm], 0, (idx->m - oldm) * sizeof(lidx_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
968 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
969 if (idx->n < tid + 1) idx->n = tid + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
970 if (idx->z.finished) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
971 if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
972 if ( tid>=0 && idx->n_no_coor )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
973 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
974 if (hts_verbose >= 1) fprintf(stderr,"[E::%s] NO_COOR reads not in a single block at the end %d %d\n", __func__, tid,idx->z.last_tid);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
975 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
976 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
977 if (tid>=0 && idx->bidx[tid] != 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
978 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
979 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] chromosome blocks not continuous\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
980 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
981 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
982 idx->z.last_tid = tid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
983 idx->z.last_bin = 0xffffffffu;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
984 } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
985 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] unsorted positions\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
986 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
987 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
988 if ( tid>=0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
989 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
990 if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
991 if ( is_mapped)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
992 insert_to_l(&idx->lidx[tid], beg, end, idx->z.last_off, idx->min_shift); // last_off points to the start of the current record
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
993 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
994 else idx->n_no_coor++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
995 bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
996 if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
997 if (idx->z.save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
998 insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, idx->z.last_off);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
999 if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1000 idx->z.off_end = idx->z.last_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1001 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, idx->z.off_end);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1002 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1003 idx->z.n_mapped = idx->z.n_unmapped = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1004 idx->z.off_beg = idx->z.off_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1005 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1006 idx->z.save_off = idx->z.last_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1007 idx->z.save_bin = idx->z.last_bin = bin;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1008 idx->z.save_tid = tid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1009 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1010 if (is_mapped) ++idx->z.n_mapped;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1011 else ++idx->z.n_unmapped;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1012 idx->z.last_off = offset;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1013 idx->z.last_coor = beg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1014 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1015 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1016
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1017 void hts_idx_destroy(hts_idx_t *idx)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1018 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1019 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1020 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1021 if (idx == 0) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1022 // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1023 if (idx->fmt == HTS_FMT_CRAI) { free(idx); return; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1024
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1025 for (i = 0; i < idx->m; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1026 bidx_t *bidx = idx->bidx[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1027 free(idx->lidx[i].offset);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1028 if (bidx == 0) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1029 for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1030 if (kh_exist(bidx, k))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1031 free(kh_value(bidx, k).list);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1032 kh_destroy(bin, bidx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1033 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1034 free(idx->bidx); free(idx->lidx); free(idx->meta);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1035 free(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1036 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1037
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1038 static inline long idx_read(int is_bgzf, void *fp, void *buf, long l)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1039 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1040 if (is_bgzf) return bgzf_read((BGZF*)fp, buf, l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1041 else return (long)fread(buf, 1, l, (FILE*)fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1042 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1043
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1044 static inline long idx_write(int is_bgzf, void *fp, const void *buf, long l)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1045 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1046 if (is_bgzf) return bgzf_write((BGZF*)fp, buf, l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1047 else return (long)fwrite(buf, 1, l, (FILE*)fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1048 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1049
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1050 static inline void swap_bins(bins_t *p)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1051 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1052 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1053 for (i = 0; i < p->n; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1054 ed_swap_8p(&p->list[i].u);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1055 ed_swap_8p(&p->list[i].v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1056 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1057 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1058
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1059 static void hts_idx_save_core(const hts_idx_t *idx, void *fp, int fmt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1060 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1061 int32_t i, size, is_be;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1062 int is_bgzf = (fmt != HTS_FMT_BAI);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1063 is_be = ed_is_big();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1064 if (is_be) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1065 uint32_t x = idx->n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1066 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1067 } else idx_write(is_bgzf, fp, &idx->n, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1068 if (fmt == HTS_FMT_TBI && idx->l_meta) idx_write(is_bgzf, fp, idx->meta, idx->l_meta);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1069 for (i = 0; i < idx->n; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1070 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1071 bidx_t *bidx = idx->bidx[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1072 lidx_t *lidx = &idx->lidx[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1073 // write binning index
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1074 size = bidx? kh_size(bidx) : 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1075 if (is_be) { // big endian
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1076 uint32_t x = size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1077 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1078 } else idx_write(is_bgzf, fp, &size, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1079 if (bidx == 0) goto write_lidx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1080 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1081 bins_t *p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1082 if (!kh_exist(bidx, k)) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1083 p = &kh_value(bidx, k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1084 if (is_be) { // big endian
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1085 uint32_t x;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1086 x = kh_key(bidx, k); idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1087 if (fmt == HTS_FMT_CSI) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1088 uint64_t y = kh_val(bidx, k).loff;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1089 idx_write(is_bgzf, fp, ed_swap_4p(&y), 8);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1090 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1091 x = p->n; idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1092 swap_bins(p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1093 idx_write(is_bgzf, fp, p->list, 16 * p->n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1094 swap_bins(p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1095 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1096 idx_write(is_bgzf, fp, &kh_key(bidx, k), 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1097 if (fmt == HTS_FMT_CSI) idx_write(is_bgzf, fp, &kh_val(bidx, k).loff, 8);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1098 //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1099 idx_write(is_bgzf, fp, &p->n, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1100 idx_write(is_bgzf, fp, p->list, p->n << 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1101 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1102 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1103 write_lidx:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1104 if (fmt != HTS_FMT_CSI) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1105 if (is_be) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1106 int32_t x = lidx->n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1107 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1108 for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1109 idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1110 for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1111 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1112 idx_write(is_bgzf, fp, &lidx->n, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1113 idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1114 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1115 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1116 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1117 if (is_be) { // write the number of reads without coordinates
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1118 uint64_t x = idx->n_no_coor;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1119 idx_write(is_bgzf, fp, &x, 8);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1120 } else idx_write(is_bgzf, fp, &idx->n_no_coor, 8);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1121 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1122
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1123 void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1124 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1125 char *fnidx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1126 fnidx = (char*)calloc(1, strlen(fn) + 5);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1127 strcpy(fnidx, fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1128 if (fmt == HTS_FMT_CSI) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1129 BGZF *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1130 uint32_t x[3];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1131 int is_be, i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1132 is_be = ed_is_big();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1133 fp = bgzf_open(strcat(fnidx, ".csi"), "w");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1134 bgzf_write(fp, "CSI\1", 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1135 x[0] = idx->min_shift; x[1] = idx->n_lvls; x[2] = idx->l_meta;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1136 if (is_be) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1137 for (i = 0; i < 3; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1138 bgzf_write(fp, ed_swap_4p(&x[i]), 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1139 } else bgzf_write(fp, &x, 12);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1140 if (idx->l_meta) bgzf_write(fp, idx->meta, idx->l_meta);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1141 hts_idx_save_core(idx, fp, HTS_FMT_CSI);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1142 bgzf_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1143 } else if (fmt == HTS_FMT_TBI) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1144 BGZF *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1145 fp = bgzf_open(strcat(fnidx, ".tbi"), "w");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1146 bgzf_write(fp, "TBI\1", 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1147 hts_idx_save_core(idx, fp, HTS_FMT_TBI);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1148 bgzf_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1149 } else if (fmt == HTS_FMT_BAI) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1150 FILE *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1151 fp = fopen(strcat(fnidx, ".bai"), "w");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1152 fwrite("BAI\1", 1, 4, fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1153 hts_idx_save_core(idx, fp, HTS_FMT_BAI);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1154 fclose(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1155 } else abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1156 free(fnidx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1157 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1158
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1159 static int hts_idx_load_core(hts_idx_t *idx, void *fp, int fmt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1160 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1161 int32_t i, n, is_be;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1162 int is_bgzf = (fmt != HTS_FMT_BAI);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1163 is_be = ed_is_big();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1164 if (idx == NULL) return -4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1165 for (i = 0; i < idx->n; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1166 bidx_t *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1167 lidx_t *l = &idx->lidx[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1168 uint32_t key;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1169 int j, absent;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1170 bins_t *p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1171 h = idx->bidx[i] = kh_init(bin);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1172 if (idx_read(is_bgzf, fp, &n, 4) != 4) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1173 if (is_be) ed_swap_4p(&n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1174 for (j = 0; j < n; ++j) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1175 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1176 if (idx_read(is_bgzf, fp, &key, 4) != 4) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1177 if (is_be) ed_swap_4p(&key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1178 k = kh_put(bin, h, key, &absent);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1179 if (absent <= 0) return -3; // Duplicate bin number
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1180 p = &kh_val(h, k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1181 if (fmt == HTS_FMT_CSI) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1182 if (idx_read(is_bgzf, fp, &p->loff, 8) != 8) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1183 if (is_be) ed_swap_8p(&p->loff);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1184 } else p->loff = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1185 if (idx_read(is_bgzf, fp, &p->n, 4) != 4) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1186 if (is_be) ed_swap_4p(&p->n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1187 p->m = p->n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1188 p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1189 if (p->list == NULL) return -2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1190 if (idx_read(is_bgzf, fp, p->list, p->n<<4) != p->n<<4) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1191 if (is_be) swap_bins(p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1192 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1193 if (fmt != HTS_FMT_CSI) { // load linear index
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1194 int j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1195 if (idx_read(is_bgzf, fp, &l->n, 4) != 4) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1196 if (is_be) ed_swap_4p(&l->n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1197 l->m = l->n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1198 l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1199 if (l->offset == NULL) return -2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1200 if (idx_read(is_bgzf, fp, l->offset, l->n << 3) != l->n << 3) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1201 if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1202 for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1203 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1204 update_loff(idx, i, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1205 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1206 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1207 if (idx_read(is_bgzf, fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1208 if (is_be) ed_swap_8p(&idx->n_no_coor);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1209 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1210 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1211
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1212 hts_idx_t *hts_idx_load_local(const char *fn, int fmt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1213 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1214 uint8_t magic[4];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1215 int i, is_be;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1216 hts_idx_t *idx = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1217 is_be = ed_is_big();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1218 if (fmt == HTS_FMT_CSI) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1219 BGZF *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1220 uint32_t x[3], n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1221 uint8_t *meta = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1222 if ((fp = bgzf_open(fn, "r")) == 0) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1223 if (bgzf_read(fp, magic, 4) != 4) goto csi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1224 if (memcmp(magic, "CSI\1", 4) != 0) goto csi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1225 if (bgzf_read(fp, x, 12) != 12) goto csi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1226 if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1227 if (x[2]) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1228 if ((meta = (uint8_t*)malloc(x[2])) == NULL) goto csi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1229 if (bgzf_read(fp, meta, x[2]) != x[2]) goto csi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1230 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1231 if (bgzf_read(fp, &n, 4) != 4) goto csi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1232 if (is_be) ed_swap_4p(&n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1233 if ((idx = hts_idx_init(n, fmt, 0, x[0], x[1])) == NULL) goto csi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1234 idx->l_meta = x[2];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1235 idx->meta = meta;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1236 meta = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1237 if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto csi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1238 bgzf_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1239 return idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1240
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1241 csi_fail:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1242 bgzf_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1243 hts_idx_destroy(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1244 free(meta);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1245 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1246
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1247 } else if (fmt == HTS_FMT_TBI) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1248 BGZF *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1249 uint32_t x[8];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1250 if ((fp = bgzf_open(fn, "r")) == 0) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1251 if (bgzf_read(fp, magic, 4) != 4) goto tbi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1252 if (memcmp(magic, "TBI\1", 4) != 0) goto tbi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1253 if (bgzf_read(fp, x, 32) != 32) goto tbi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1254 if (is_be) for (i = 0; i < 8; ++i) ed_swap_4p(&x[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1255 if ((idx = hts_idx_init(x[0], fmt, 0, 14, 5)) == NULL) goto tbi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1256 idx->l_meta = 28 + x[7];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1257 if ((idx->meta = (uint8_t*)malloc(idx->l_meta)) == NULL) goto tbi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1258 memcpy(idx->meta, &x[1], 28);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1259 if (bgzf_read(fp, idx->meta + 28, x[7]) != x[7]) goto tbi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1260 if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto tbi_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1261 bgzf_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1262 return idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1263
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1264 tbi_fail:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1265 bgzf_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1266 hts_idx_destroy(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1267 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1268
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1269 } else if (fmt == HTS_FMT_BAI) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1270 uint32_t n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1271 FILE *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1272 if ((fp = fopen(fn, "rb")) == 0) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1273 if (fread(magic, 1, 4, fp) != 4) goto bai_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1274 if (memcmp(magic, "BAI\1", 4) != 0) goto bai_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1275 if (fread(&n, 4, 1, fp) != 1) goto bai_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1276 if (is_be) ed_swap_4p(&n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1277 idx = hts_idx_init(n, fmt, 0, 14, 5);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1278 if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto bai_fail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1279 fclose(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1280 return idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1281
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1282 bai_fail:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1283 fclose(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1284 hts_idx_destroy(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1285 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1286
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1287 } else abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1288 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1289
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1290 void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1291 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1292 if (idx->meta) free(idx->meta);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1293 idx->l_meta = l_meta;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1294 if (is_copy) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1295 idx->meta = (uint8_t*)malloc(l_meta);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1296 memcpy(idx->meta, meta, l_meta);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1297 } else idx->meta = meta;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1298 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1299
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1300 uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1301 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1302 *l_meta = idx->l_meta;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1303 return idx->meta;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1304 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1305
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1306 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1307 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1308 if ( !idx->n )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1309 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1310 *n = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1311 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1312 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1313
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1314 int tid = 0, i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1315 const char **names = (const char**) calloc(idx->n,sizeof(const char*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1316 for (i=0; i<idx->n; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1317 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1318 bidx_t *bidx = idx->bidx[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1319 if ( !bidx ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1320 names[tid++] = getid(hdr,i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1321 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1322 *n = tid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1323 return names;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1324 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1325
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1326 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1327 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1328 if ( idx->fmt == HTS_FMT_CRAI ) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1329 *mapped = 0; *unmapped = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1330 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1331 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1332
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1333 bidx_t *h = idx->bidx[tid];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1334 khint_t k = kh_get(bin, h, META_BIN(idx));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1335 if (k != kh_end(h)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1336 *mapped = kh_val(h, k).list[1].u;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1337 *unmapped = kh_val(h, k).list[1].v;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1338 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1339 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1340 *mapped = 0; *unmapped = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1341 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1342 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1343 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1344
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1345 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1346 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1347 return idx->n_no_coor;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1348 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1349
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1350 /****************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1351 *** Iterator ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1352 ****************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1353
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1354 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1355 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1356 int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1357 if (beg >= end) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1358 if (end >= 1LL<<s) end = 1LL<<s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1359 for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1360 int b, e, n, i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1361 b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1362 if (itr->bins.n + n > itr->bins.m) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1363 itr->bins.m = itr->bins.n + n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1364 kroundup32(itr->bins.m);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1365 itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1366 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1367 for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1368 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1369 return itr->bins.n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1370 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1371
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1372 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1373 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1374 int i, n_off, l, bin;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1375 hts_pair64_t *off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1376 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1377 bidx_t *bidx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1378 uint64_t min_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1379 hts_itr_t *iter = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1380 if (tid < 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1381 int finished0 = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1382 uint64_t off0 = (uint64_t)-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1383 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1384 switch (tid) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1385 case HTS_IDX_START:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1386 // Find the smallest offset, note that sequence ids may not be ordered sequentially
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1387 for (i=0; i<idx->n; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1388 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1389 bidx = idx->bidx[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1390 k = kh_get(bin, bidx, META_BIN(idx));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1391 if (k == kh_end(bidx)) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1392 if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1393 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1394 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1395 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1396
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1397 case HTS_IDX_NOCOOR:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1398 if ( idx->n>0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1399 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1400 bidx = idx->bidx[idx->n - 1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1401 k = kh_get(bin, bidx, META_BIN(idx));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1402 if (k != kh_end(bidx)) off0 = kh_val(bidx, k).list[0].v;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1403 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1404 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1405 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1406
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1407 case HTS_IDX_REST:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1408 off0 = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1409 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1410
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1411 case HTS_IDX_NONE:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1412 finished0 = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1413 off0 = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1414 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1415
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1416 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1417 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1418 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1419 if (off0 != (uint64_t)-1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1420 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1421 iter->read_rest = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1422 iter->finished = finished0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1423 iter->curr_off = off0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1424 iter->readrec = readrec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1425 return iter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1426 } else return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1427 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1428
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1429 if (beg < 0) beg = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1430 if (end < beg) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1431 if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1432
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1433 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1434 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1435 iter->readrec = readrec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1436
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1437 // compute min_off
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1438 bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1439 do {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1440 int first;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1441 k = kh_get(bin, bidx, bin);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1442 if (k != kh_end(bidx)) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1443 first = (hts_bin_parent(bin)<<3) + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1444 if (bin > first) --bin;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1445 else bin = hts_bin_parent(bin);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1446 } while (bin);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1447 if (bin == 0) k = kh_get(bin, bidx, bin);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1448 min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1449 // retrieve bins
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1450 reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1451 for (i = n_off = 0; i < iter->bins.n; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1452 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1453 n_off += kh_value(bidx, k).n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1454 if (n_off == 0) return iter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1455 off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1456 for (i = n_off = 0; i < iter->bins.n; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1457 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1458 int j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1459 bins_t *p = &kh_value(bidx, k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1460 for (j = 0; j < p->n; ++j)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1461 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1462 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1463 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1464 if (n_off == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1465 free(off); return iter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1466 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1467 ks_introsort(_off, n_off, off);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1468 // resolve completely contained adjacent blocks
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1469 for (i = 1, l = 0; i < n_off; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1470 if (off[l].v < off[i].v) off[++l] = off[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1471 n_off = l + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1472 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1473 for (i = 1; i < n_off; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1474 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1475 // merge adjacent blocks
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1476 for (i = 1, l = 0; i < n_off; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1477 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1478 else off[++l] = off[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1479 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1480 n_off = l + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1481 iter->n_off = n_off; iter->off = off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1482 return iter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1483 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1484
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1485 void hts_itr_destroy(hts_itr_t *iter)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1486 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1487 if (iter) { free(iter->off); free(iter->bins.a); free(iter); }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1488 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1489
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1490 const char *hts_parse_reg(const char *s, int *beg, int *end)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1491 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1492 int i, k, l, name_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1493 *beg = *end = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1494 name_end = l = strlen(s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1495 // determine the sequence name
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1496 for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1497 if (i >= 0) name_end = i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1498 if (name_end < l) { // check if this is really the end
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1499 int n_hyphen = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1500 for (i = name_end + 1; i < l; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1501 if (s[i] == '-') ++n_hyphen;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1502 else if (!isdigit(s[i]) && s[i] != ',') break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1503 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1504 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1505 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1506 // parse the interval
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1507 if (name_end < l) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1508 char *tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1509 tmp = (char*)alloca(l - name_end + 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1510 for (i = name_end + 1, k = 0; i < l; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1511 if (s[i] != ',') tmp[k++] = s[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1512 tmp[k] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1513 if ((*beg = strtol(tmp, &tmp, 10) - 1) < 0) *beg = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1514 *end = *tmp? strtol(tmp + 1, &tmp, 10) : INT_MAX;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1515 if (*beg > *end) name_end = l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1516 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1517 if (name_end == l) *beg = 0, *end = INT_MAX;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1518 return s + name_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1519 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1520
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1521 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1522 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1523 int tid, beg, end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1524 char *q, *tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1525 if (strcmp(reg, ".") == 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1526 return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1527 else if (strcmp(reg, "*") != 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1528 q = (char*)hts_parse_reg(reg, &beg, &end);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1529 tmp = (char*)alloca(q - reg + 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1530 strncpy(tmp, reg, q - reg);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1531 tmp[q - reg] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1532 if ((tid = getid(hdr, tmp)) < 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1533 tid = getid(hdr, reg);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1534 if (tid < 0) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1535 return itr_query(idx, tid, beg, end, readrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1536 } else return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1537 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1538
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1539 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1540 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1541 int ret, tid, beg, end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1542 if (iter == NULL || iter->finished) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1543 if (iter->read_rest) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1544 if (iter->curr_off) { // seek to the start
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1545 bgzf_seek(fp, iter->curr_off, SEEK_SET);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1546 iter->curr_off = 0; // only seek once
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1547 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1548 ret = iter->readrec(fp, data, r, &tid, &beg, &end);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1549 if (ret < 0) iter->finished = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1550 iter->curr_tid = tid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1551 iter->curr_beg = beg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1552 iter->curr_end = end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1553 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1554 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1555 if (iter->off == 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1556 for (;;) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1557 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1558 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1559 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1560 bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1561 iter->curr_off = bgzf_tell(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1562 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1563 ++iter->i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1564 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1565 if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1566 iter->curr_off = bgzf_tell(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1567 if (tid != iter->tid || beg >= iter->end) { // no need to proceed
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1568 ret = -1; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1569 } else if (end > iter->beg && iter->end > beg) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1570 iter->curr_tid = tid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1571 iter->curr_beg = beg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1572 iter->curr_end = end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1573 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1574 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1575 } else break; // end of file or error
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1576 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1577 iter->finished = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1578 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1579 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1580
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1581 /**********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1582 *** Retrieve index ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1583 **********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1584
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1585 static char *test_and_fetch(const char *fn)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1586 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1587 FILE *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1588 if (hisremote(fn)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1589 const int buf_size = 1 * 1024 * 1024;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1590 hFILE *fp_remote;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1591 uint8_t *buf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1592 int l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1593 const char *p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1594 for (p = fn + strlen(fn) - 1; p >= fn; --p)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1595 if (*p == '/') break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1596 ++p; // p now points to the local file name
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1597 // Attempt to open local file first
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1598 if ((fp = fopen((char*)p, "rb")) != 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1599 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1600 fclose(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1601 return (char*)p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1602 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1603 // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1604 if ((fp_remote = hopen(fn, "r")) == 0) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1605 if ((fp = fopen(p, "w")) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1606 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1607 hclose_abruptly(fp_remote);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1608 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1609 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1610 if (hts_verbose >= 3) fprintf(stderr, "[M::%s] downloading file '%s' to local directory\n", __func__, fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1611 buf = (uint8_t*)calloc(buf_size, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1612 while ((l = hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1613 free(buf);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1614 fclose(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1615 if (hclose(fp_remote) != 0) fprintf(stderr, "[E::%s] fail to close remote file '%s'\n", __func__, fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1616 return (char*)p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1617 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1618 if ((fp = fopen(fn, "rb")) == 0) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1619 fclose(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1620 return (char*)fn;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1621 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1622 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1623
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1624 char *hts_idx_getfn(const char *fn, const char *ext)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1625 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1626 int i, l_fn, l_ext;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1627 char *fnidx, *ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1628 l_fn = strlen(fn); l_ext = strlen(ext);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1629 fnidx = (char*)calloc(l_fn + l_ext + 1, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1630 strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1631 if ((ret = test_and_fetch(fnidx)) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1632 for (i = l_fn - 1; i > 0; --i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1633 if (fnidx[i] == '.') break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1634 strcpy(fnidx + i, ext);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1635 ret = test_and_fetch(fnidx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1636 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1637 if (ret == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1638 free(fnidx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1639 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1640 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1641 l_fn = strlen(ret);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1642 memmove(fnidx, ret, l_fn + 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1643 return fnidx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1644 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1645
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1646 hts_idx_t *hts_idx_load(const char *fn, int fmt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1647 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1648 char *fnidx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1649 hts_idx_t *idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1650 fnidx = hts_idx_getfn(fn, ".csi");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1651 if (fnidx) fmt = HTS_FMT_CSI;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1652 else fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1653 if (fnidx == 0) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1654
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1655 // Check that the index file is up to date, the main file might have changed
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1656 struct stat stat_idx,stat_main;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1657 if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1658 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1659 if ( stat_idx.st_mtime < stat_main.st_mtime )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1660 fprintf(stderr, "Warning: The index file is older than the data file: %s\n", fnidx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1661 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1662 idx = hts_idx_load_local(fnidx, fmt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1663 free(fnidx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1664 return idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1665 }