annotate ezBAMQC/src/htslib/regidx.c @ 18:494b5cd02238

bash script
author youngkim
date Wed, 30 Mar 2016 13:39:05 -0400
parents dfa3745e5fd8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2 Copyright (C) 2014 Genome Research Ltd.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
4 Author: Petr Danecek <pd3@sanger.ac.uk>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
5
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
6 Permission is hereby granted, free of charge, to any person obtaining a copy
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
7 of this software and associated documentation files (the "Software"), to deal
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
8 in the Software without restriction, including without limitation the rights
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
10 copies of the Software, and to permit persons to whom the Software is
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
11 furnished to do so, subject to the following conditions:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
12
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
13 The above copyright notice and this permission notice shall be included in
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
14 all copies or substantial portions of the Software.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
15
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
22 THE SOFTWARE.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
23 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
24
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
25 #include "htslib/hts.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
26 #include "htslib/kstring.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
27 #include "htslib/kseq.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
28 #include "htslib/khash_str2int.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
29 #include "htslib/regidx.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
30
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
31 #define LIDX_SHIFT 13 // number of insignificant index bits
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
32
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
33 // List of regions for one chromosome
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
34 typedef struct
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
35 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
36 int *idx, nidx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
37 int nregs, mregs; // n:used, m:alloced
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
38 reg_t *regs;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
39 void *payload;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
40 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
41 reglist_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
42
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
43 // Container of all sequences
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
44 struct _regidx_t
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
45 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
46 int nseq, mseq; // n:used, m:alloced
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
47 reglist_t *seq; // regions for each sequence
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
48 void *seq2regs; // hash for fast lookup from chr name to regions
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
49 char **seq_names;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
50 regidx_free_f free; // function to free any data allocated by regidx_parse_f
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
51 regidx_parse_f parse; // parse one input line
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
52 void *usr; // user data to pass to regidx_parse_f
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
53
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
54 // temporary data for index initialization
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
55 kstring_t str;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
56 int rid_prev, start_prev, end_prev;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
57 int payload_size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
58 void *payload;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
59 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
60
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
61 int regidx_seq_nregs(regidx_t *idx, const char *seq)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
62 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
63 int iseq;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
64 if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
65 return idx->seq[iseq].nregs;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
66 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
67
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
68 int regidx_nregs(regidx_t *idx)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
69 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
70 int i, nregs = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
71 for (i=0; i<idx->nseq; i++) nregs += idx->seq[i].nregs;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
72 return nregs;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
73 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
74
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
75 char **regidx_seq_names(regidx_t *idx, int *n)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
76 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
77 *n = idx->nseq;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
78 return idx->seq_names;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
79 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
80
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
81 int _regidx_build_index(regidx_t *idx)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
82 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
83 int iseq;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
84 for (iseq=0; iseq<idx->nseq; iseq++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
85 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
86 reglist_t *list = &idx->seq[iseq];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
87 int j,k, imax = 0; // max index bin
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
88 for (j=0; j<list->nregs; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
89 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
90 int ibeg = list->regs[j].start >> LIDX_SHIFT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
91 int iend = list->regs[j].end >> LIDX_SHIFT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
92 if ( imax < iend + 1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
93 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
94 int old_imax = imax;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
95 imax = iend + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
96 kroundup32(imax);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
97 list->idx = (int*) realloc(list->idx, imax*sizeof(int));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
98 for (k=old_imax; k<imax; k++) list->idx[k] = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
99 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
100 if ( ibeg==iend )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
101 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
102 if ( list->idx[ibeg]<0 ) list->idx[ibeg] = j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
103 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
104 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
105 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
106 for (k=ibeg; k<=iend; k++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
107 if ( list->idx[k]<0 ) list->idx[k] = j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
108 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
109 list->nidx = iend + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
110 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
111 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
112 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
113 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
114
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
115 int regidx_insert(regidx_t *idx, char *line)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
116 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
117 if ( !line )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
118 return _regidx_build_index(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
119
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
120 char *chr_from, *chr_to;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
121 reg_t reg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
122 int ret = idx->parse(line,&chr_from,&chr_to,&reg,idx->payload,idx->usr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
123 if ( ret==-2 ) return -1; // error
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
124 if ( ret==-1 ) return 0; // skip the line
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
125
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
126 int rid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
127 idx->str.l = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
128 kputsn(chr_from, chr_to-chr_from+1, &idx->str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
129 if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
130 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
131 idx->nseq++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
132 int m_prev = idx->mseq;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
133 hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
134 hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
135 idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
136 rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
137 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
138
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
139 reglist_t *list = &idx->seq[rid];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
140 list->nregs++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
141 int m_prev = list->mregs;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
142 hts_expand(reg_t,list->nregs,list->mregs,list->regs);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
143 list->regs[list->nregs-1] = reg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
144 if ( idx->payload_size )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
145 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
146 if ( m_prev < list->mregs ) list->payload = realloc(list->payload,idx->payload_size*list->mregs);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
147 memcpy(list->payload + idx->payload_size*(list->nregs-1), idx->payload, idx->payload_size);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
148 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
149
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
150 if ( idx->rid_prev==rid )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
151 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
152 if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
153 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
154 fprintf(stderr,"The regions are not sorted: %s:%d-%d is before %s:%d-%d\n",
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
155 idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
156 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
157 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
158 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
159 idx->rid_prev = rid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
160 idx->start_prev = reg.start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
161 idx->end_prev = reg.end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
162 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
163 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
164
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
165 regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
166 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
167 if ( !parser )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
168 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
169 if ( !fname ) parser = regidx_parse_tab;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
170 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
171 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
172 int len = strlen(fname);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
173 if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
174 parser = regidx_parse_bed;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
175 else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
176 parser = regidx_parse_bed;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
177 else if ( len>=4 && !strcasecmp(".bed",fname+len-4) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
178 parser = regidx_parse_bed;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
179 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
180 parser = regidx_parse_tab;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
181 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
182 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
183
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
184 regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
185 idx->free = free_f;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
186 idx->parse = parser;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
187 idx->usr = usr_dat;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
188 idx->seq2regs = khash_str2int_init();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
189 idx->rid_prev = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
190 idx->start_prev = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
191 idx->end_prev = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
192 idx->payload_size = payload_size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
193 if ( payload_size ) idx->payload = malloc(payload_size);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
194
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
195 if ( !fname ) return idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
196
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
197 kstring_t str = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
198
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
199 htsFile *fp = hts_open(fname,"r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
200 if ( !fp ) goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
201
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
202 while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
203 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
204 if ( regidx_insert(idx, str.s) ) goto error;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
205 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
206 regidx_insert(idx, NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
207
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
208 free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
209 hts_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
210 return idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
211
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
212 error:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
213 free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
214 if ( fp ) hts_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
215 regidx_destroy(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
216 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
217 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
218
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
219 void regidx_destroy(regidx_t *idx)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
220 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
221 int i, j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
222 for (i=0; i<idx->nseq; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
223 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
224 reglist_t *list = &idx->seq[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
225 if ( idx->free )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
226 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
227 for (j=0; j<list->nregs; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
228 idx->free(list->payload + idx->payload_size*j);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
229 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
230 free(list->payload);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
231 free(list->regs);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
232 free(list->idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
233 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
234 free(idx->seq_names);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
235 free(idx->seq);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
236 free(idx->str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
237 free(idx->payload);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
238 khash_str2int_destroy_free(idx->seq2regs);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
239 free(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
240 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
241
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
242 int regidx_overlap(regidx_t *idx, const char *chr, uint32_t from, uint32_t to, regitr_t *itr)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
243 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
244 if ( itr ) itr->i = itr->n = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
245
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
246 int iseq;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
247 if ( khash_str2int_get(idx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
248
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
249 reglist_t *list = &idx->seq[iseq];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
250 if ( !list->nregs ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
251
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
252 int i, ibeg = from>>LIDX_SHIFT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
253 int ireg = ibeg < list->nidx ? list->idx[ibeg] : list->idx[ list->nidx - 1 ];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
254 if ( ireg < 0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
255 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
256 // linear search; if slow, replace with binary search
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
257 if ( ibeg > list->nidx ) ibeg = list->nidx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
258 for (i=ibeg - 1; i>=0; i--)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
259 if ( list->idx[i] >=0 ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
260 ireg = i>=0 ? list->idx[i] : 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
261 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
262 for (i=ireg; i<list->nregs; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
263 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
264 if ( list->regs[i].start > to ) return 0; // no match
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
265 if ( list->regs[i].end >= from && list->regs[i].start <= to ) break; // found
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
266 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
267
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
268 if ( i>=list->nregs ) return 0; // no match
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
269
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
270 if ( !itr ) return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
271
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
272 itr->i = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
273 itr->n = list->nregs - i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
274 itr->reg = &idx->seq[iseq].regs[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
275 if ( idx->payload_size )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
276 itr->payload = idx->seq[iseq].payload + i*idx->payload_size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
277 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
278 itr->payload = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
279
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
280 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
281 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
282
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
283 int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
284 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
285 char *ss = (char*) line;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
286 while ( *ss && isspace(*ss) ) ss++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
287 if ( !*ss ) return -1; // skip blank lines
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
288 if ( *ss=='#' ) return -1; // skip comments
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
289
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
290 char *se = ss;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
291 while ( *se && !isspace(*se) ) se++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
292 if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
293
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
294 *chr_beg = ss;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
295 *chr_end = se-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
296
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
297 ss = se+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
298 reg->start = strtol(ss, &se, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
299 if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
300
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
301 ss = se+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
302 reg->end = strtol(ss, &se, 10) - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
303 if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
304
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
305 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
306 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
307
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
308 int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
309 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
310 char *ss = (char*) line;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
311 while ( *ss && isspace(*ss) ) ss++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
312 if ( !*ss ) return -1; // skip blank lines
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
313 if ( *ss=='#' ) return -1; // skip comments
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
314
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
315 char *se = ss;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
316 while ( *se && !isspace(*se) ) se++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
317 if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
318
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
319 *chr_beg = ss;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
320 *chr_end = se-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
321
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
322 ss = se+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
323 reg->start = strtol(ss, &se, 10) - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
324 if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
325
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
326 if ( !se[0] || !se[1] )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
327 reg->end = reg->start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
328 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
329 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
330 ss = se+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
331 reg->end = strtol(ss, &se, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
332 if ( ss==se ) reg->end = reg->start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
333 else reg->end--;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
334 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
335
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
336 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
337 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
338