annotate ezBAMQC/src/htslib/cram/cram_index.c @ 20:9de3bbec2479 draft default tip

Uploaded
author youngkim
date Thu, 31 Mar 2016 10:10:37 -0400
parents dfa3745e5fd8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2 Copyright (c) 2013-2014 Genome Research Ltd.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3 Author: James Bonfield <jkb@sanger.ac.uk>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
4
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
5 Redistribution and use in source and binary forms, with or without
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
6 modification, are permitted provided that the following conditions are met:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
7
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
8 1. Redistributions of source code must retain the above copyright notice,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
9 this list of conditions and the following disclaimer.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
10
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
11 2. Redistributions in binary form must reproduce the above copyright notice,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
12 this list of conditions and the following disclaimer in the documentation
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
13 and/or other materials provided with the distribution.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
14
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
16 Institute nor the names of its contributors may be used to endorse or promote
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
17 products derived from this software without specific prior written permission.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
18
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
29 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
30
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
31 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
32 * The index is a gzipped tab-delimited text file with one line per slice.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
33 * The columns are:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
34 * 1: reference number (0 to N-1, as per BAM ref_id)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
35 * 2: reference position of 1st read in slice (1..?)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
36 * 3: number of reads in slice
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
37 * 4: offset of container start (relative to end of SAM header, so 1st
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
38 * container is offset 0).
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
39 * 5: slice number within container (ie which landmark).
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
40 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
41 * In memory, we hold this in a nested containment list. Each list element is
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
42 * a cram_index struct. Each element in turn can contain its own list of
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
43 * cram_index structs.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
44 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
45 * Any start..end range which is entirely contained within another (and
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
46 * earlier as it is sorted) range will be held within it. This ensures that
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
47 * the outer list will never have containments and we can safely do a
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
48 * binary search to find the first range which overlaps any given coordinate.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
49 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
50
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
51 #ifdef HAVE_CONFIG_H
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
52 #include "io_lib_config.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
53 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
54
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
55 #include <stdio.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
56 #include <errno.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
57 #include <assert.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
58 #include <stdlib.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
59 #include <string.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
60 #include <zlib.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
61 #include <sys/types.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
62 #include <sys/stat.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
63 #include <math.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
64 #include <ctype.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
65
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
66 #include "htslib/hfile.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
67 #include "cram/cram.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
68 #include "cram/os.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
69 #include "cram/zfio.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
70
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
71 #if 0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
72 static void dump_index_(cram_index *e, int level) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
73 int i, n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
74 n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
75 printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
76 for (i = 0; i < e->nslice; i++) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
77 dump_index_(&e->e[i], level+1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
78 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
79 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
80
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
81 static void dump_index(cram_fd *fd) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
82 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
83 for (i = 0; i < fd->index_sz; i++) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
84 dump_index_(&fd->index[i], 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
85 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
86 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
87 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
88
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
89 static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
90 int sign = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
91 int32_t val = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
92 size_t p = *pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
93
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
94 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
95 p++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
96
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
97 if (p < k->l && k->s[p] == '-')
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
98 sign = -1, p++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
99
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
100 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
101 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
102
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
103 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
104 val = val*10 + k->s[p++]-'0';
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
105
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
106 *pos = p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
107 *val_p = sign*val;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
108
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
109 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
110 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
111
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
112 static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
113 int sign = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
114 int64_t val = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
115 size_t p = *pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
116
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
117 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
118 p++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
119
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
120 if (p < k->l && k->s[p] == '-')
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
121 sign = -1, p++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
122
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
123 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
124 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
125
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
126 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
127 val = val*10 + k->s[p++]-'0';
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
128
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
129 *pos = p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
130 *val_p = sign*val;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
131
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
132 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
133 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
134
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
135 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
136 * Loads a CRAM .crai index into memory.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
137 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
138 * Returns 0 for success
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
139 * -1 for failure
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
140 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
141 int cram_index_load(cram_fd *fd, const char *fn) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
142 char fn2[PATH_MAX];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
143 char buf[65536];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
144 ssize_t len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
145 kstring_t kstr = {0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
146 hFILE *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
147 cram_index *idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
148 cram_index **idx_stack = NULL, *ep, e;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
149 int idx_stack_alloc = 0, idx_stack_ptr = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
150 size_t pos = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
151
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
152 /* Check if already loaded */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
153 if (fd->index)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
154 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
155
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
156 fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
157 if (!fd->index)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
158 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
159
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
160 idx = &fd->index[0];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
161 idx->refid = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
162 idx->start = INT_MIN;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
163 idx->end = INT_MAX;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
164
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
165 idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
166 idx_stack[idx_stack_ptr] = idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
167
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
168 sprintf(fn2, "%s.crai", fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
169 if (!(fp = hopen(fn2, "r"))) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
170 perror(fn2);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
171 free(idx_stack);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
172 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
173 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
174
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
175 // Load the file into memory
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
176 while ((len = hread(fp, buf, 65536)) > 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
177 kputsn(buf, len, &kstr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
178 if (len < 0 || kstr.l < 2) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
179 if (kstr.s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
180 free(kstr.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
181 free(idx_stack);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
182 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
183 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
184
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
185 if (hclose(fp)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
186 if (kstr.s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
187 free(kstr.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
188 free(idx_stack);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
189 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
190 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
191
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
192
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
193 // Uncompress if required
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
194 if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
195 size_t l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
196 char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
197 free(kstr.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
198 if (!s) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
199 free(idx_stack);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
200 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
201 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
202 kstr.s = s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
203 kstr.l = l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
204 kstr.m = l; // conservative estimate of the size allocated
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
205 kputsn("", 0, &kstr); // ensure kstr.s is NUL-terminated
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
206 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
207
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
208
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
209 // Parse it line at a time
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
210 do {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
211 /* 1.1 layout */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
212 if (kget_int32(&kstr, &pos, &e.refid) == -1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
213 free(kstr.s); free(idx_stack); return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
214 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
215 if (kget_int32(&kstr, &pos, &e.start) == -1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
216 free(kstr.s); free(idx_stack); return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
217 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
218 if (kget_int32(&kstr, &pos, &e.end) == -1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
219 free(kstr.s); free(idx_stack); return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
220 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
221 if (kget_int64(&kstr, &pos, &e.offset) == -1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
222 free(kstr.s); free(idx_stack); return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
223 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
224 if (kget_int32(&kstr, &pos, &e.slice) == -1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
225 free(kstr.s); free(idx_stack); return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
226 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
227 if (kget_int32(&kstr, &pos, &e.len) == -1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
228 free(kstr.s); free(idx_stack); return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
229 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
230
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
231 e.end += e.start-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
232 //printf("%d/%d..%d\n", e.refid, e.start, e.end);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
233
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
234 if (e.refid < -1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
235 free(kstr.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
236 free(idx_stack);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
237 fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
238 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
239 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
240
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
241 if (e.refid != idx->refid) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
242 if (fd->index_sz < e.refid+2) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
243 size_t index_end = fd->index_sz * sizeof(*fd->index);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
244 fd->index_sz = e.refid+2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
245 fd->index = realloc(fd->index,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
246 fd->index_sz * sizeof(*fd->index));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
247 memset(((char *)fd->index) + index_end, 0,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
248 fd->index_sz * sizeof(*fd->index) - index_end);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
249 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
250 idx = &fd->index[e.refid+1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
251 idx->refid = e.refid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
252 idx->start = INT_MIN;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
253 idx->end = INT_MAX;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
254 idx->nslice = idx->nalloc = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
255 idx->e = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
256 idx_stack[(idx_stack_ptr = 0)] = idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
257 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
258
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
259 while (!(e.start >= idx->start && e.end <= idx->end)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
260 idx = idx_stack[--idx_stack_ptr];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
261 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
262
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
263 // Now contains, so append
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
264 if (idx->nslice+1 >= idx->nalloc) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
265 idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
266 idx->e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
267 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
268
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
269 e.nalloc = e.nslice = 0; e.e = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
270 *(ep = &idx->e[idx->nslice++]) = e;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
271 idx = ep;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
272
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
273 if (++idx_stack_ptr >= idx_stack_alloc) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
274 idx_stack_alloc *= 2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
275 idx_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
276 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
277 idx_stack[idx_stack_ptr] = idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
278
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
279 while (pos < kstr.l && kstr.s[pos] != '\n')
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
280 pos++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
281 pos++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
282 } while (pos < kstr.l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
283
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
284 free(idx_stack);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
285 free(kstr.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
286
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
287 // dump_index(fd);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
288
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
289 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
290 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
291
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
292 static void cram_index_free_recurse(cram_index *e) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
293 if (e->e) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
294 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
295 for (i = 0; i < e->nslice; i++) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
296 cram_index_free_recurse(&e->e[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
297 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
298 free(e->e);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
299 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
300 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
301
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
302 void cram_index_free(cram_fd *fd) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
303 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
304
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
305 if (!fd->index)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
306 return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
307
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
308 for (i = 0; i < fd->index_sz; i++) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
309 cram_index_free_recurse(&fd->index[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
310 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
311 free(fd->index);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
312
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
313 fd->index = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
314 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
315
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
316 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
317 * Searches the index for the first slice overlapping a reference ID
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
318 * and position, or one immediately preceding it if none is found in
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
319 * the index to overlap this position. (Our index may have missing
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
320 * entries, but we require at least one per reference.)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
321 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
322 * If the index finds multiple slices overlapping this position we
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
323 * return the first one only. Subsequent calls should specifying
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
324 * "from" as the last slice we checked to find the next one. Otherwise
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
325 * set "from" to be NULL to find the first one.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
326 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
327 * Returns the cram_index pointer on sucess
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
328 * NULL on failure
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
329 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
330 cram_index *cram_index_query(cram_fd *fd, int refid, int pos,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
331 cram_index *from) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
332 int i, j, k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
333 cram_index *e;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
334
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
335 if (refid+1 < 0 || refid+1 >= fd->index_sz)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
336 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
337
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
338 i = 0, j = fd->index[refid+1].nslice-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
339
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
340 if (!from)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
341 from = &fd->index[refid+1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
342
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
343 for (k = j/2; k != i; k = (j-i)/2 + i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
344 if (from->e[k].refid > refid) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
345 j = k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
346 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
347 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
348
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
349 if (from->e[k].refid < refid) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
350 i = k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
351 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
352 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
353
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
354 if (from->e[k].start >= pos) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
355 j = k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
356 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
357 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
358
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
359 if (from->e[k].start < pos) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
360 i = k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
361 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
362 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
363 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
364 // i==j or i==j-1. Check if j is better.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
365 if (from->e[j].start < pos && from->e[j].refid == refid)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
366 i = j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
367
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
368 /* The above found *a* bin overlapping, but not necessarily the first */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
369 while (i > 0 && from->e[i-1].end >= pos)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
370 i--;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
371
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
372 /* Special case for matching a start pos */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
373 if (i+1 < from->nslice &&
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
374 from->e[i+1].start == pos &&
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
375 from->e[i+1].refid == refid)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
376 i++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
377
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
378 e = &from->e[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
379
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
380 return e;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
381 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
382
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
383
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
384 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
385 * Skips to a container overlapping the start coordinate listed in
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
386 * cram_range.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
387 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
388 * In theory we call cram_index_query multiple times, once per slice
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
389 * overlapping the range. However slices may be absent from the index
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
390 * which makes this problematic. Instead we find the left-most slice
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
391 * and then read from then on, skipping decoding of slices and/or
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
392 * whole containers when they don't overlap the specified cram_range.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
393 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
394 * Returns 0 on success
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
395 * -1 on failure
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
396 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
397 int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
398 cram_index *e;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
399
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
400 // Ideally use an index, so see if we have one.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
401 if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
402 if (0 != cram_seek(fd, e->offset, SEEK_SET))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
403 if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
404 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
405 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
406 fprintf(stderr, "Unknown reference ID. Missing from index?\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
407 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
408 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
409
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
410 if (fd->ctr) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
411 cram_free_container(fd->ctr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
412 fd->ctr = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
413 fd->ooc = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
414 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
415
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
416 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
417 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
418
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
419
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
420 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
421 * A specialised form of cram_index_build (below) that deals with slices
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
422 * having multiple references in this (ref_id -2). In this scenario we
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
423 * decode the slice to look at the RI data series instead.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
424 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
425 * Returns 0 on success
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
426 * -1 on failure
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
427 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
428 static int cram_index_build_multiref(cram_fd *fd,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
429 cram_container *c,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
430 cram_slice *s,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
431 zfp *fp,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
432 off_t cpos,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
433 int32_t landmark,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
434 int sz) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
435 int i, ref = -2, ref_start = 0, ref_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
436 char buf[1024];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
437
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
438 if (0 != cram_decode_slice(fd, c, s, fd->header))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
439 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
440
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
441 ref_end = INT_MIN;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
442 for (i = 0; i < s->hdr->num_records; i++) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
443 if (s->crecs[i].ref_id == ref) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
444 if (ref_end < s->crecs[i].aend)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
445 ref_end = s->crecs[i].aend;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
446 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
447 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
448
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
449 if (ref != -2) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
450 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
451 ref, ref_start, ref_end - ref_start + 1,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
452 (int64_t)cpos, landmark, sz);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
453 zfputs(buf, fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
454 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
455
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
456 ref = s->crecs[i].ref_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
457 ref_start = s->crecs[i].apos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
458 ref_end = INT_MIN;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
459 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
460
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
461 if (ref != -2) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
462 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
463 ref, ref_start, ref_end - ref_start + 1,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
464 (int64_t)cpos, landmark, sz);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
465 zfputs(buf, fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
466 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
467
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
468 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
469 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
470
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
471 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
472 * Builds an index file.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
473 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
474 * fd is a newly opened cram file that we wish to index.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
475 * fn_base is the filename of the associated CRAM file. Internally we
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
476 * add ".crai" to this to get the index filename.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
477 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
478 * Returns 0 on success
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
479 * -1 on failure
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
480 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
481 int cram_index_build(cram_fd *fd, const char *fn_base) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
482 cram_container *c;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
483 off_t cpos, spos, hpos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
484 zfp *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
485 char fn_idx[PATH_MAX];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
486
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
487 if (strlen(fn_base) > PATH_MAX-6)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
488 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
489
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
490 sprintf(fn_idx, "%s.crai", fn_base);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
491 if (!(fp = zfopen(fn_idx, "wz"))) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
492 perror(fn_idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
493 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
494 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
495
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
496 cpos = htell(fd->fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
497 while ((c = cram_read_container(fd))) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
498 int j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
499
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
500 if (fd->err) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
501 perror("Cram container read");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
502 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
503 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
504
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
505 hpos = htell(fd->fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
506
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
507 if (!(c->comp_hdr_block = cram_read_block(fd)))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
508 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
509 assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
510
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
511 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
512 if (!c->comp_hdr)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
513 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
514
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
515 // 2.0 format
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
516 for (j = 0; j < c->num_landmarks; j++) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
517 char buf[1024];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
518 cram_slice *s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
519 int sz;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
520
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
521 spos = htell(fd->fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
522 assert(spos - cpos - c->offset == c->landmark[j]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
523
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
524 if (!(s = cram_read_slice(fd))) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
525 zfclose(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
526 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
527 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
528
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
529 sz = (int)(htell(fd->fp) - spos);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
530
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
531 if (s->hdr->ref_seq_id == -2) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
532 cram_index_build_multiref(fd, c, s, fp,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
533 cpos, c->landmark[j], sz);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
534 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
535 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
536 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
537 s->hdr->ref_seq_span, (int64_t)cpos,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
538 c->landmark[j], sz);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
539 zfputs(buf, fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
540 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
541
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
542 cram_free_slice(s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
543 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
544
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
545 cpos = htell(fd->fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
546 assert(cpos == hpos + c->length);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
547
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
548 cram_free_container(c);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
549 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
550 if (fd->err) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
551 zfclose(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
552 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
553 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
554
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
555
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
556 return zfclose(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
557 }