Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/regidx.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* | |
2 Copyright (C) 2014 Genome Research Ltd. | |
3 | |
4 Author: Petr Danecek <pd3@sanger.ac.uk> | |
5 | |
6 Permission is hereby granted, free of charge, to any person obtaining a copy | |
7 of this software and associated documentation files (the "Software"), to deal | |
8 in the Software without restriction, including without limitation the rights | |
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
10 copies of the Software, and to permit persons to whom the Software is | |
11 furnished to do so, subject to the following conditions: | |
12 | |
13 The above copyright notice and this permission notice shall be included in | |
14 all copies or substantial portions of the Software. | |
15 | |
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
22 THE SOFTWARE. | |
23 */ | |
24 | |
25 #include "htslib/hts.h" | |
26 #include "htslib/kstring.h" | |
27 #include "htslib/kseq.h" | |
28 #include "htslib/khash_str2int.h" | |
29 #include "htslib/regidx.h" | |
30 | |
31 #define LIDX_SHIFT 13 // number of insignificant index bits | |
32 | |
33 // List of regions for one chromosome | |
34 typedef struct | |
35 { | |
36 int *idx, nidx; | |
37 int nregs, mregs; // n:used, m:alloced | |
38 reg_t *regs; | |
39 void *payload; | |
40 } | |
41 reglist_t; | |
42 | |
43 // Container of all sequences | |
44 struct _regidx_t | |
45 { | |
46 int nseq, mseq; // n:used, m:alloced | |
47 reglist_t *seq; // regions for each sequence | |
48 void *seq2regs; // hash for fast lookup from chr name to regions | |
49 char **seq_names; | |
50 regidx_free_f free; // function to free any data allocated by regidx_parse_f | |
51 regidx_parse_f parse; // parse one input line | |
52 void *usr; // user data to pass to regidx_parse_f | |
53 | |
54 // temporary data for index initialization | |
55 kstring_t str; | |
56 int rid_prev, start_prev, end_prev; | |
57 int payload_size; | |
58 void *payload; | |
59 }; | |
60 | |
61 int regidx_seq_nregs(regidx_t *idx, const char *seq) | |
62 { | |
63 int iseq; | |
64 if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence | |
65 return idx->seq[iseq].nregs; | |
66 } | |
67 | |
68 int regidx_nregs(regidx_t *idx) | |
69 { | |
70 int i, nregs = 0; | |
71 for (i=0; i<idx->nseq; i++) nregs += idx->seq[i].nregs; | |
72 return nregs; | |
73 } | |
74 | |
75 char **regidx_seq_names(regidx_t *idx, int *n) | |
76 { | |
77 *n = idx->nseq; | |
78 return idx->seq_names; | |
79 } | |
80 | |
81 int _regidx_build_index(regidx_t *idx) | |
82 { | |
83 int iseq; | |
84 for (iseq=0; iseq<idx->nseq; iseq++) | |
85 { | |
86 reglist_t *list = &idx->seq[iseq]; | |
87 int j,k, imax = 0; // max index bin | |
88 for (j=0; j<list->nregs; j++) | |
89 { | |
90 int ibeg = list->regs[j].start >> LIDX_SHIFT; | |
91 int iend = list->regs[j].end >> LIDX_SHIFT; | |
92 if ( imax < iend + 1 ) | |
93 { | |
94 int old_imax = imax; | |
95 imax = iend + 1; | |
96 kroundup32(imax); | |
97 list->idx = (int*) realloc(list->idx, imax*sizeof(int)); | |
98 for (k=old_imax; k<imax; k++) list->idx[k] = -1; | |
99 } | |
100 if ( ibeg==iend ) | |
101 { | |
102 if ( list->idx[ibeg]<0 ) list->idx[ibeg] = j; | |
103 } | |
104 else | |
105 { | |
106 for (k=ibeg; k<=iend; k++) | |
107 if ( list->idx[k]<0 ) list->idx[k] = j; | |
108 } | |
109 list->nidx = iend + 1; | |
110 } | |
111 } | |
112 return 0; | |
113 } | |
114 | |
115 int regidx_insert(regidx_t *idx, char *line) | |
116 { | |
117 if ( !line ) | |
118 return _regidx_build_index(idx); | |
119 | |
120 char *chr_from, *chr_to; | |
121 reg_t reg; | |
122 int ret = idx->parse(line,&chr_from,&chr_to,®,idx->payload,idx->usr); | |
123 if ( ret==-2 ) return -1; // error | |
124 if ( ret==-1 ) return 0; // skip the line | |
125 | |
126 int rid; | |
127 idx->str.l = 0; | |
128 kputsn(chr_from, chr_to-chr_from+1, &idx->str); | |
129 if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 ) | |
130 { | |
131 idx->nseq++; | |
132 int m_prev = idx->mseq; | |
133 hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq); | |
134 hts_expand0(char*,idx->nseq,m_prev,idx->seq_names); | |
135 idx->seq_names[idx->nseq-1] = strdup(idx->str.s); | |
136 rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]); | |
137 } | |
138 | |
139 reglist_t *list = &idx->seq[rid]; | |
140 list->nregs++; | |
141 int m_prev = list->mregs; | |
142 hts_expand(reg_t,list->nregs,list->mregs,list->regs); | |
143 list->regs[list->nregs-1] = reg; | |
144 if ( idx->payload_size ) | |
145 { | |
146 if ( m_prev < list->mregs ) list->payload = realloc(list->payload,idx->payload_size*list->mregs); | |
147 memcpy(list->payload + idx->payload_size*(list->nregs-1), idx->payload, idx->payload_size); | |
148 } | |
149 | |
150 if ( idx->rid_prev==rid ) | |
151 { | |
152 if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) ) | |
153 { | |
154 fprintf(stderr,"The regions are not sorted: %s:%d-%d is before %s:%d-%d\n", | |
155 idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1); | |
156 return -1; | |
157 } | |
158 } | |
159 idx->rid_prev = rid; | |
160 idx->start_prev = reg.start; | |
161 idx->end_prev = reg.end; | |
162 return 0; | |
163 } | |
164 | |
165 regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat) | |
166 { | |
167 if ( !parser ) | |
168 { | |
169 if ( !fname ) parser = regidx_parse_tab; | |
170 else | |
171 { | |
172 int len = strlen(fname); | |
173 if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) ) | |
174 parser = regidx_parse_bed; | |
175 else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) ) | |
176 parser = regidx_parse_bed; | |
177 else if ( len>=4 && !strcasecmp(".bed",fname+len-4) ) | |
178 parser = regidx_parse_bed; | |
179 else | |
180 parser = regidx_parse_tab; | |
181 } | |
182 } | |
183 | |
184 regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t)); | |
185 idx->free = free_f; | |
186 idx->parse = parser; | |
187 idx->usr = usr_dat; | |
188 idx->seq2regs = khash_str2int_init(); | |
189 idx->rid_prev = -1; | |
190 idx->start_prev = -1; | |
191 idx->end_prev = -1; | |
192 idx->payload_size = payload_size; | |
193 if ( payload_size ) idx->payload = malloc(payload_size); | |
194 | |
195 if ( !fname ) return idx; | |
196 | |
197 kstring_t str = {0,0,0}; | |
198 | |
199 htsFile *fp = hts_open(fname,"r"); | |
200 if ( !fp ) goto error; | |
201 | |
202 while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 ) | |
203 { | |
204 if ( regidx_insert(idx, str.s) ) goto error; | |
205 } | |
206 regidx_insert(idx, NULL); | |
207 | |
208 free(str.s); | |
209 hts_close(fp); | |
210 return idx; | |
211 | |
212 error: | |
213 free(str.s); | |
214 if ( fp ) hts_close(fp); | |
215 regidx_destroy(idx); | |
216 return NULL; | |
217 } | |
218 | |
219 void regidx_destroy(regidx_t *idx) | |
220 { | |
221 int i, j; | |
222 for (i=0; i<idx->nseq; i++) | |
223 { | |
224 reglist_t *list = &idx->seq[i]; | |
225 if ( idx->free ) | |
226 { | |
227 for (j=0; j<list->nregs; j++) | |
228 idx->free(list->payload + idx->payload_size*j); | |
229 } | |
230 free(list->payload); | |
231 free(list->regs); | |
232 free(list->idx); | |
233 } | |
234 free(idx->seq_names); | |
235 free(idx->seq); | |
236 free(idx->str.s); | |
237 free(idx->payload); | |
238 khash_str2int_destroy_free(idx->seq2regs); | |
239 free(idx); | |
240 } | |
241 | |
242 int regidx_overlap(regidx_t *idx, const char *chr, uint32_t from, uint32_t to, regitr_t *itr) | |
243 { | |
244 if ( itr ) itr->i = itr->n = 0; | |
245 | |
246 int iseq; | |
247 if ( khash_str2int_get(idx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence | |
248 | |
249 reglist_t *list = &idx->seq[iseq]; | |
250 if ( !list->nregs ) return 0; | |
251 | |
252 int i, ibeg = from>>LIDX_SHIFT; | |
253 int ireg = ibeg < list->nidx ? list->idx[ibeg] : list->idx[ list->nidx - 1 ]; | |
254 if ( ireg < 0 ) | |
255 { | |
256 // linear search; if slow, replace with binary search | |
257 if ( ibeg > list->nidx ) ibeg = list->nidx; | |
258 for (i=ibeg - 1; i>=0; i--) | |
259 if ( list->idx[i] >=0 ) break; | |
260 ireg = i>=0 ? list->idx[i] : 0; | |
261 } | |
262 for (i=ireg; i<list->nregs; i++) | |
263 { | |
264 if ( list->regs[i].start > to ) return 0; // no match | |
265 if ( list->regs[i].end >= from && list->regs[i].start <= to ) break; // found | |
266 } | |
267 | |
268 if ( i>=list->nregs ) return 0; // no match | |
269 | |
270 if ( !itr ) return 1; | |
271 | |
272 itr->i = 0; | |
273 itr->n = list->nregs - i; | |
274 itr->reg = &idx->seq[iseq].regs[i]; | |
275 if ( idx->payload_size ) | |
276 itr->payload = idx->seq[iseq].payload + i*idx->payload_size; | |
277 else | |
278 itr->payload = NULL; | |
279 | |
280 return 1; | |
281 } | |
282 | |
283 int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr) | |
284 { | |
285 char *ss = (char*) line; | |
286 while ( *ss && isspace(*ss) ) ss++; | |
287 if ( !*ss ) return -1; // skip blank lines | |
288 if ( *ss=='#' ) return -1; // skip comments | |
289 | |
290 char *se = ss; | |
291 while ( *se && !isspace(*se) ) se++; | |
292 if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } | |
293 | |
294 *chr_beg = ss; | |
295 *chr_end = se-1; | |
296 | |
297 ss = se+1; | |
298 reg->start = strtol(ss, &se, 10); | |
299 if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } | |
300 | |
301 ss = se+1; | |
302 reg->end = strtol(ss, &se, 10) - 1; | |
303 if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } | |
304 | |
305 return 0; | |
306 } | |
307 | |
308 int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr) | |
309 { | |
310 char *ss = (char*) line; | |
311 while ( *ss && isspace(*ss) ) ss++; | |
312 if ( !*ss ) return -1; // skip blank lines | |
313 if ( *ss=='#' ) return -1; // skip comments | |
314 | |
315 char *se = ss; | |
316 while ( *se && !isspace(*se) ) se++; | |
317 if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } | |
318 | |
319 *chr_beg = ss; | |
320 *chr_end = se-1; | |
321 | |
322 ss = se+1; | |
323 reg->start = strtol(ss, &se, 10) - 1; | |
324 if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; } | |
325 | |
326 if ( !se[0] || !se[1] ) | |
327 reg->end = reg->start; | |
328 else | |
329 { | |
330 ss = se+1; | |
331 reg->end = strtol(ss, &se, 10); | |
332 if ( ss==se ) reg->end = reg->start; | |
333 else reg->end--; | |
334 } | |
335 | |
336 return 0; | |
337 } | |
338 |