Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/synced_bcf_reader.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* synced_bcf_reader.c -- stream through multiple VCF files. | |
2 | |
3 Copyright (C) 2012-2014 Genome Research Ltd. | |
4 | |
5 Author: Petr Danecek <pd3@sanger.ac.uk> | |
6 | |
7 Permission is hereby granted, free of charge, to any person obtaining a copy | |
8 of this software and associated documentation files (the "Software"), to deal | |
9 in the Software without restriction, including without limitation the rights | |
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
11 copies of the Software, and to permit persons to whom the Software is | |
12 furnished to do so, subject to the following conditions: | |
13 | |
14 The above copyright notice and this permission notice shall be included in | |
15 all copies or substantial portions of the Software. | |
16 | |
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
23 DEALINGS IN THE SOFTWARE. */ | |
24 | |
25 #include <stdio.h> | |
26 #include <unistd.h> | |
27 #include <string.h> | |
28 #include <limits.h> | |
29 #include <errno.h> | |
30 #include <ctype.h> | |
31 #include <sys/stat.h> | |
32 #include "htslib/synced_bcf_reader.h" | |
33 #include "htslib/kseq.h" | |
34 #include "htslib/khash_str2int.h" | |
35 | |
36 #define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi | |
37 | |
38 typedef struct | |
39 { | |
40 uint32_t start, end; | |
41 } | |
42 region1_t; | |
43 | |
44 typedef struct _region_t | |
45 { | |
46 region1_t *regs; | |
47 int nregs, mregs, creg; | |
48 } | |
49 region_t; | |
50 | |
51 static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end); | |
52 static bcf_sr_regions_t *_regions_init_string(const char *str); | |
53 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); | |
54 | |
55 char *bcf_sr_strerror(int errnum) | |
56 { | |
57 switch (errnum) | |
58 { | |
59 case open_failed: | |
60 return strerror(errno); break; | |
61 case not_bgzf: | |
62 return "not compressed with bgzip"; break; | |
63 case idx_load_failed: | |
64 return "could not load index"; break; | |
65 case file_type_error: | |
66 return "unknown file type"; break; | |
67 case api_usage_error: | |
68 return "API usage error"; break; | |
69 case header_error: | |
70 return "could not parse header"; break; | |
71 default: return ""; | |
72 } | |
73 } | |
74 | |
75 static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters) | |
76 { | |
77 kstring_t str = {0,0,0}; | |
78 const char *tmp = filters, *prev = filters; | |
79 int nout = 0, *out = NULL; | |
80 while ( 1 ) | |
81 { | |
82 if ( *tmp==',' || !*tmp ) | |
83 { | |
84 out = (int*) realloc(out, (nout+1)*sizeof(int)); | |
85 if ( tmp-prev==1 && *prev=='.' ) | |
86 out[nout] = -1; | |
87 else | |
88 { | |
89 str.l = 0; | |
90 kputsn(prev, tmp-prev, &str); | |
91 out[nout] = bcf_hdr_id2int(hdr, BCF_DT_ID, str.s); | |
92 } | |
93 nout++; | |
94 if ( !*tmp ) break; | |
95 prev = tmp+1; | |
96 } | |
97 tmp++; | |
98 } | |
99 if ( str.m ) free(str.s); | |
100 *nfilters = nout; | |
101 return out; | |
102 } | |
103 | |
104 int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file) | |
105 { | |
106 assert( !readers->regions ); | |
107 if ( readers->nreaders ) | |
108 { | |
109 fprintf(stderr,"[%s:%d %s] Error: bcf_sr_set_regions() must be called before bcf_sr_add_reader()\n", __FILE__,__LINE__,__FUNCTION__); | |
110 return -1; | |
111 } | |
112 readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); | |
113 if ( !readers->regions ) return -1; | |
114 readers->explicit_regs = 1; | |
115 readers->require_index = 1; | |
116 return 0; | |
117 } | |
118 int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles) | |
119 { | |
120 assert( !readers->targets ); | |
121 if ( targets[0]=='^' ) | |
122 { | |
123 readers->targets_exclude = 1; | |
124 targets++; | |
125 } | |
126 readers->targets = bcf_sr_regions_init(targets,is_file,0,1,-2); | |
127 if ( !readers->targets ) return -1; | |
128 readers->targets_als = alleles; | |
129 return 0; | |
130 } | |
131 | |
132 int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) | |
133 { | |
134 htsFile* file_ptr = hts_open(fname, "r"); | |
135 if ( ! file_ptr ) { | |
136 files->errnum = open_failed; | |
137 return 0; | |
138 } | |
139 | |
140 files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1)); | |
141 files->has_line[files->nreaders] = 0; | |
142 files->readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1)); | |
143 bcf_sr_t *reader = &files->readers[files->nreaders++]; | |
144 memset(reader,0,sizeof(bcf_sr_t)); | |
145 | |
146 reader->file = file_ptr; | |
147 | |
148 files->errnum = 0; | |
149 | |
150 if ( files->require_index ) | |
151 { | |
152 if ( reader->file->format.format==vcf ) | |
153 { | |
154 if ( reader->file->format.compression!=bgzf ) | |
155 { | |
156 files->errnum = not_bgzf; | |
157 return 0; | |
158 } | |
159 | |
160 reader->tbx_idx = tbx_index_load(fname); | |
161 if ( !reader->tbx_idx ) | |
162 { | |
163 files->errnum = idx_load_failed; | |
164 return 0; | |
165 } | |
166 | |
167 reader->header = bcf_hdr_read(reader->file); | |
168 } | |
169 else if ( reader->file->format.format==bcf ) | |
170 { | |
171 if ( reader->file->format.compression!=bgzf ) | |
172 { | |
173 files->errnum = not_bgzf; | |
174 return 0; | |
175 } | |
176 | |
177 reader->header = bcf_hdr_read(reader->file); | |
178 | |
179 reader->bcf_idx = bcf_index_load(fname); | |
180 if ( !reader->bcf_idx ) | |
181 { | |
182 files->errnum = idx_load_failed; | |
183 return 0; | |
184 } | |
185 } | |
186 else | |
187 { | |
188 files->errnum = file_type_error; | |
189 return 0; | |
190 } | |
191 } | |
192 else | |
193 { | |
194 if ( reader->file->format.format==bcf || reader->file->format.format==vcf ) | |
195 { | |
196 reader->header = bcf_hdr_read(reader->file); | |
197 } | |
198 else | |
199 { | |
200 files->errnum = file_type_error; | |
201 return 0; | |
202 } | |
203 files->streaming = 1; | |
204 } | |
205 if ( files->streaming && files->nreaders>1 ) | |
206 { | |
207 files->errnum = api_usage_error; | |
208 fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders); | |
209 return 0; | |
210 } | |
211 if ( files->streaming && files->regions ) | |
212 { | |
213 files->errnum = api_usage_error; | |
214 fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__); | |
215 return 0; | |
216 } | |
217 if ( !reader->header ) | |
218 { | |
219 files->errnum = header_error; | |
220 return 0; | |
221 } | |
222 | |
223 reader->fname = fname; | |
224 if ( files->apply_filters ) | |
225 reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids); | |
226 | |
227 // Update list of chromosomes | |
228 if ( !files->explicit_regs && !files->streaming ) | |
229 { | |
230 int n,i; | |
231 const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n); | |
232 for (i=0; i<n; i++) | |
233 { | |
234 if ( !files->regions ) | |
235 files->regions = _regions_init_string(names[i]); | |
236 else | |
237 _regions_add(files->regions, names[i], -1, -1); | |
238 } | |
239 free(names); | |
240 } | |
241 | |
242 return 1; | |
243 } | |
244 | |
245 bcf_srs_t *bcf_sr_init(void) | |
246 { | |
247 bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t)); | |
248 return files; | |
249 } | |
250 | |
251 static void bcf_sr_destroy1(bcf_sr_t *reader) | |
252 { | |
253 if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx); | |
254 if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx); | |
255 bcf_hdr_destroy(reader->header); | |
256 hts_close(reader->file); | |
257 if ( reader->itr ) tbx_itr_destroy(reader->itr); | |
258 int j; | |
259 for (j=0; j<reader->mbuffer; j++) | |
260 bcf_destroy1(reader->buffer[j]); | |
261 free(reader->buffer); | |
262 free(reader->samples); | |
263 free(reader->filter_ids); | |
264 } | |
265 void bcf_sr_destroy(bcf_srs_t *files) | |
266 { | |
267 int i; | |
268 for (i=0; i<files->nreaders; i++) | |
269 bcf_sr_destroy1(&files->readers[i]); | |
270 free(files->has_line); | |
271 free(files->readers); | |
272 for (i=0; i<files->n_smpl; i++) free(files->samples[i]); | |
273 free(files->samples); | |
274 if (files->targets) bcf_sr_regions_destroy(files->targets); | |
275 if (files->regions) bcf_sr_regions_destroy(files->regions); | |
276 if ( files->tmps.m ) free(files->tmps.s); | |
277 free(files); | |
278 } | |
279 | |
280 void bcf_sr_remove_reader(bcf_srs_t *files, int i) | |
281 { | |
282 assert( !files->samples ); // not ready for this yet | |
283 bcf_sr_destroy1(&files->readers[i]); | |
284 if ( i+1 < files->nreaders ) | |
285 { | |
286 memmove(&files->readers[i], &files->readers[i+1], (files->nreaders-i-1)*sizeof(bcf_sr_t)); | |
287 memmove(&files->has_line[i], &files->has_line[i+1], (files->nreaders-i-1)*sizeof(int)); | |
288 } | |
289 files->nreaders--; | |
290 } | |
291 | |
292 | |
293 /* | |
294 Removes duplicate records from the buffer. The meaning of "duplicate" is | |
295 controlled by the $collapse variable, which can cause that from multiple | |
296 <indel|snp|any> lines only the first is considered and the rest is ignored. | |
297 The removal is done by setting the redundant lines' positions to -1 and | |
298 moving these lines at the end of the buffer. | |
299 */ | |
300 static void collapse_buffer(bcf_srs_t *files, bcf_sr_t *reader) | |
301 { | |
302 int irec,jrec, has_snp=0, has_indel=0, has_any=0; | |
303 for (irec=1; irec<=reader->nbuffer; irec++) | |
304 { | |
305 bcf1_t *line = reader->buffer[irec]; | |
306 if ( line->pos != reader->buffer[1]->pos ) break; | |
307 if ( files->collapse&COLLAPSE_ANY ) | |
308 { | |
309 if ( !has_any ) has_any = 1; | |
310 else line->pos = -1; | |
311 } | |
312 int line_type = bcf_get_variant_types(line); | |
313 if ( files->collapse&COLLAPSE_SNPS && line_type&(VCF_SNP|VCF_MNP) ) | |
314 { | |
315 if ( !has_snp ) has_snp = 1; | |
316 else line->pos = -1; | |
317 } | |
318 if ( files->collapse&COLLAPSE_INDELS && line_type&VCF_INDEL ) | |
319 { | |
320 if ( !has_indel ) has_indel = 1; | |
321 else line->pos = -1; | |
322 } | |
323 } | |
324 bcf1_t *tmp; | |
325 irec = jrec = 1; | |
326 while ( irec<=reader->nbuffer && jrec<=reader->nbuffer ) | |
327 { | |
328 if ( reader->buffer[irec]->pos != -1 ) { irec++; continue; } | |
329 if ( jrec<=irec ) jrec = irec+1; | |
330 while ( jrec<=reader->nbuffer && reader->buffer[jrec]->pos==-1 ) jrec++; | |
331 if ( jrec<=reader->nbuffer ) | |
332 { | |
333 tmp = reader->buffer[irec]; reader->buffer[irec] = reader->buffer[jrec]; reader->buffer[jrec] = tmp; | |
334 } | |
335 } | |
336 reader->nbuffer = irec - 1; | |
337 } | |
338 | |
339 void debug_buffer(FILE *fp, bcf_sr_t *reader) | |
340 { | |
341 int j; | |
342 for (j=0; j<=reader->nbuffer; j++) | |
343 { | |
344 bcf1_t *line = reader->buffer[j]; | |
345 fprintf(fp,"%s%s\t%s:%d\t%s ", reader->fname,j==0?"*":"",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); | |
346 int k; | |
347 for (k=1; k<line->n_allele; k++) fprintf(fp," %s", line->d.allele[k]); | |
348 fprintf(fp,"\n"); | |
349 } | |
350 } | |
351 | |
352 void debug_buffers(FILE *fp, bcf_srs_t *files) | |
353 { | |
354 int i; | |
355 for (i=0; i<files->nreaders; i++) | |
356 { | |
357 fprintf(fp, "has_line: %d\t%s\n", bcf_sr_has_line(files,i),files->readers[i].fname); | |
358 debug_buffer(fp, &files->readers[i]); | |
359 } | |
360 fprintf(fp,"\n"); | |
361 } | |
362 | |
363 static inline int has_filter(bcf_sr_t *reader, bcf1_t *line) | |
364 { | |
365 int i, j; | |
366 if ( !line->d.n_flt ) | |
367 { | |
368 for (j=0; j<reader->nfilter_ids; j++) | |
369 if ( reader->filter_ids[j]<0 ) return 1; | |
370 return 0; | |
371 } | |
372 for (i=0; i<line->d.n_flt; i++) | |
373 { | |
374 for (j=0; j<reader->nfilter_ids; j++) | |
375 if ( line->d.flt[i]==reader->filter_ids[j] ) return 1; | |
376 } | |
377 return 0; | |
378 } | |
379 | |
380 static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end) | |
381 { | |
382 if ( end>=MAX_CSI_COOR ) | |
383 { | |
384 fprintf(stderr,"The coordinate is out of csi index limit: %d\n", end+1); | |
385 exit(1); | |
386 } | |
387 if ( reader->itr ) | |
388 { | |
389 hts_itr_destroy(reader->itr); | |
390 reader->itr = NULL; | |
391 } | |
392 reader->nbuffer = 0; | |
393 if ( reader->tbx_idx ) | |
394 { | |
395 int tid = tbx_name2id(reader->tbx_idx, seq); | |
396 if ( tid==-1 ) return -1; // the sequence not present in this file | |
397 reader->itr = tbx_itr_queryi(reader->tbx_idx,tid,start,end+1); | |
398 } | |
399 else | |
400 { | |
401 int tid = bcf_hdr_name2id(reader->header, seq); | |
402 if ( tid==-1 ) return -1; // the sequence not present in this file | |
403 reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,start,end+1); | |
404 } | |
405 assert(reader->itr); | |
406 return 0; | |
407 } | |
408 | |
409 /* | |
410 * _readers_next_region() - jumps to next region if necessary | |
411 * Returns 0 on success or -1 when there are no more regions left | |
412 */ | |
413 static int _readers_next_region(bcf_srs_t *files) | |
414 { | |
415 // Need to open new chromosome? Check number of lines in all readers' buffers | |
416 int i, eos = 0; | |
417 for (i=0; i<files->nreaders; i++) | |
418 if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++; | |
419 | |
420 if ( eos!=files->nreaders ) | |
421 { | |
422 // Some of the readers still has buffered lines | |
423 return 0; | |
424 } | |
425 | |
426 // No lines in the buffer, need to open new region or quit | |
427 if ( bcf_sr_regions_next(files->regions)<0 ) return -1; | |
428 | |
429 for (i=0; i<files->nreaders; i++) | |
430 _reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end); | |
431 | |
432 return 0; | |
433 } | |
434 | |
435 /* | |
436 * _reader_fill_buffer() - buffers all records with the same coordinate | |
437 */ | |
438 static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) | |
439 { | |
440 // Return if the buffer is full: the coordinate of the last buffered record differs | |
441 if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return; | |
442 | |
443 // No iterator (sequence not present in this file) and not streaming | |
444 if ( !reader->itr && !files->streaming ) return; | |
445 | |
446 // Fill the buffer with records starting at the same position | |
447 int i, ret = 0; | |
448 while (1) | |
449 { | |
450 if ( reader->nbuffer+1 >= reader->mbuffer ) | |
451 { | |
452 // Increase buffer size | |
453 reader->mbuffer += 8; | |
454 reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer); | |
455 for (i=8; i>0; i--) // initialize | |
456 { | |
457 reader->buffer[reader->mbuffer-i] = bcf_init1(); | |
458 reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack; | |
459 reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1 | |
460 } | |
461 } | |
462 if ( files->streaming ) | |
463 { | |
464 if ( reader->file->format.format==vcf ) | |
465 { | |
466 if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines | |
467 int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); | |
468 if ( ret<0 ) break; | |
469 } | |
470 else if ( reader->file->format.format==bcf ) | |
471 { | |
472 if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines | |
473 } | |
474 else | |
475 { | |
476 fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__); | |
477 exit(1); | |
478 } | |
479 } | |
480 else if ( reader->tbx_idx ) | |
481 { | |
482 if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break; // no more lines | |
483 vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); | |
484 } | |
485 else | |
486 { | |
487 if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines | |
488 bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); | |
489 } | |
490 | |
491 // apply filter | |
492 if ( !reader->nfilter_ids ) | |
493 bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR); | |
494 else | |
495 { | |
496 bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT); | |
497 if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue; | |
498 } | |
499 reader->nbuffer++; | |
500 | |
501 if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full | |
502 } | |
503 if ( ret<0 ) | |
504 { | |
505 // done for this region | |
506 tbx_itr_destroy(reader->itr); | |
507 reader->itr = NULL; | |
508 } | |
509 if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos ) | |
510 collapse_buffer(files, reader); | |
511 } | |
512 | |
513 /* | |
514 * _readers_shift_buffer() - removes the first line and all subsequent lines with the same position | |
515 */ | |
516 static void _reader_shift_buffer(bcf_sr_t *reader) | |
517 { | |
518 int i; | |
519 for (i=2; i<=reader->nbuffer; i++) | |
520 if ( reader->buffer[i]->pos!=reader->buffer[1]->pos ) break; | |
521 if ( i<=reader->nbuffer ) | |
522 { | |
523 // A record with a different position follows, swap it. Because of the reader's logic, | |
524 // only one such line can be present. | |
525 bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[i]; reader->buffer[i] = tmp; | |
526 reader->nbuffer = 1; | |
527 } | |
528 else | |
529 reader->nbuffer = 0; // no other line | |
530 } | |
531 | |
532 /* | |
533 * _reader_match_alleles() - from multiple buffered lines selects the one which | |
534 * corresponds best to the template line. The logic is controlled by COLLAPSE_* | |
535 * Returns 0 on success or -1 when no good matching line is found. | |
536 */ | |
537 static int _reader_match_alleles(bcf_srs_t *files, bcf_sr_t *reader, bcf1_t *tmpl) | |
538 { | |
539 int i, irec = -1; | |
540 | |
541 // if no template given, use the first available record | |
542 if ( !tmpl ) | |
543 irec = 1; | |
544 else | |
545 { | |
546 int tmpl_type = bcf_get_variant_types(tmpl); | |
547 for (i=1; i<=reader->nbuffer; i++) | |
548 { | |
549 bcf1_t *line = reader->buffer[i]; | |
550 if ( line->pos != reader->buffer[1]->pos ) break; // done with this reader | |
551 | |
552 // Easiest case: matching by position only | |
553 if ( files->collapse&COLLAPSE_ANY ) { irec=i; break; } | |
554 | |
555 int line_type = bcf_get_variant_types(line); | |
556 | |
557 // No matter what the alleles are, as long as they are both SNPs | |
558 if ( files->collapse&COLLAPSE_SNPS && tmpl_type&VCF_SNP && line_type&VCF_SNP ) { irec=i; break; } | |
559 // ... or indels | |
560 if ( files->collapse&COLLAPSE_INDELS && tmpl_type&VCF_INDEL && line_type&VCF_INDEL ) { irec=i; break; } | |
561 | |
562 // More thorough checking: REFs must match | |
563 if ( tmpl->rlen != line->rlen ) continue; // different length | |
564 if ( strcmp(tmpl->d.allele[0], line->d.allele[0]) ) continue; // the strings do not match | |
565 | |
566 int ial,jal; | |
567 if ( files->collapse==COLLAPSE_NONE ) | |
568 { | |
569 // Exact match, all alleles must be identical | |
570 if ( tmpl->n_allele!=line->n_allele ) continue; // different number of alleles, skip | |
571 | |
572 int nmatch = 1; // REF has been already checked | |
573 for (ial=1; ial<tmpl->n_allele; ial++) | |
574 { | |
575 for (jal=1; jal<line->n_allele; jal++) | |
576 if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; } | |
577 } | |
578 if ( nmatch==tmpl->n_allele ) { irec=i; break; } // found: exact match | |
579 continue; | |
580 } | |
581 | |
582 if ( line->n_allele==1 && tmpl->n_allele==1 ) { irec=i; break; } // both sites are non-variant | |
583 | |
584 // COLLAPSE_SOME: at least some ALTs must match | |
585 for (ial=1; ial<tmpl->n_allele; ial++) | |
586 { | |
587 for (jal=1; jal<line->n_allele; jal++) | |
588 if ( !strcmp(tmpl->d.allele[ial], line->d.allele[jal]) ) { irec=i; break; } | |
589 if ( irec>=1 ) break; | |
590 } | |
591 if ( irec>=1 ) break; | |
592 } | |
593 if ( irec==-1 ) return -1; // no matching line was found | |
594 } | |
595 | |
596 // Set the selected line (irec) as active: set it to buffer[0], move the remaining lines forward | |
597 // and put the old bcf1_t record at the end. | |
598 bcf1_t *tmp = reader->buffer[0]; | |
599 reader->buffer[0] = reader->buffer[irec]; | |
600 for (i=irec+1; i<=reader->nbuffer; i++) reader->buffer[i-1] = reader->buffer[i]; | |
601 reader->buffer[ reader->nbuffer ] = tmp; | |
602 reader->nbuffer--; | |
603 | |
604 return 0; | |
605 } | |
606 | |
607 int _reader_next_line(bcf_srs_t *files) | |
608 { | |
609 int i, min_pos = INT_MAX; | |
610 | |
611 // Loop until next suitable line is found or all readers have finished | |
612 while ( 1 ) | |
613 { | |
614 // Get all readers ready for the next region. | |
615 if ( files->regions && _readers_next_region(files)<0 ) break; | |
616 | |
617 // Fill buffers | |
618 const char *chr = NULL; | |
619 for (i=0; i<files->nreaders; i++) | |
620 { | |
621 _reader_fill_buffer(files, &files->readers[i]); | |
622 | |
623 // Update the minimum coordinate | |
624 if ( !files->readers[i].nbuffer ) continue; | |
625 if ( min_pos > files->readers[i].buffer[1]->pos ) | |
626 { | |
627 min_pos = files->readers[i].buffer[1]->pos; | |
628 chr = bcf_seqname(files->readers[i].header, files->readers[i].buffer[1]); | |
629 } | |
630 } | |
631 if ( min_pos==INT_MAX ) | |
632 { | |
633 if ( !files->regions ) break; | |
634 continue; | |
635 } | |
636 | |
637 // Skip this position if not present in targets | |
638 if ( files->targets ) | |
639 { | |
640 int ret = bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos); | |
641 if ( (!files->targets_exclude && ret<0) || (files->targets_exclude && !ret) ) | |
642 { | |
643 // Remove all lines with this position from the buffer | |
644 for (i=0; i<files->nreaders; i++) | |
645 if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos ) | |
646 _reader_shift_buffer(&files->readers[i]); | |
647 min_pos = INT_MAX; | |
648 continue; | |
649 } | |
650 } | |
651 | |
652 break; // done: min_pos is set | |
653 } | |
654 | |
655 // There can be records with duplicate positions. Set the active line intelligently so that | |
656 // the alleles match. | |
657 int nret = 0; // number of readers sharing the position | |
658 bcf1_t *first = NULL; // record which will be used for allele matching | |
659 for (i=0; i<files->nreaders; i++) | |
660 { | |
661 files->has_line[i] = 0; | |
662 | |
663 // Skip readers with no records at this position | |
664 if ( !files->readers[i].nbuffer || files->readers[i].buffer[1]->pos!=min_pos ) continue; | |
665 | |
666 // Until now buffer[0] of all reader was empty and the lines started at buffer[1]. | |
667 // Now lines which are ready to be output will be moved to buffer[0]. | |
668 if ( _reader_match_alleles(files, &files->readers[i], first) < 0 ) continue; | |
669 if ( !first ) first = files->readers[i].buffer[0]; | |
670 | |
671 nret++; | |
672 files->has_line[i] = 1; | |
673 } | |
674 return nret; | |
675 } | |
676 | |
677 int bcf_sr_next_line(bcf_srs_t *files) | |
678 { | |
679 if ( !files->targets_als ) | |
680 return _reader_next_line(files); | |
681 | |
682 while (1) | |
683 { | |
684 int i, ret = _reader_next_line(files); | |
685 if ( !ret ) return ret; | |
686 | |
687 for (i=0; i<files->nreaders; i++) | |
688 if ( files->has_line[i] ) break; | |
689 | |
690 if ( _regions_match_alleles(files->targets, files->targets_als-1, files->readers[i].buffer[0]) ) return ret; | |
691 | |
692 // Check if there are more duplicate lines in the buffers. If not, return this line as if it | |
693 // matched the targets, even if there is a type mismatch | |
694 for (i=0; i<files->nreaders; i++) | |
695 { | |
696 if ( !files->has_line[i] ) continue; | |
697 if ( files->readers[i].nbuffer==0 || files->readers[i].buffer[1]->pos!=files->readers[i].buffer[0]->pos ) continue; | |
698 break; | |
699 } | |
700 if ( i==files->nreaders ) return ret; // no more lines left, output even if target alleles are not of the same type | |
701 } | |
702 } | |
703 | |
704 static void bcf_sr_seek_start(bcf_srs_t *readers) | |
705 { | |
706 bcf_sr_regions_t *reg = readers->regions; | |
707 int i; | |
708 for (i=0; i<reg->nseqs; i++) | |
709 reg->regs[i].creg = -1; | |
710 reg->iseq = 0; | |
711 } | |
712 | |
713 | |
714 int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos) | |
715 { | |
716 if ( !seq && !pos ) | |
717 { | |
718 // seek to start | |
719 bcf_sr_seek_start(readers); | |
720 return 0; | |
721 } | |
722 | |
723 bcf_sr_regions_overlap(readers->regions, seq, pos, pos); | |
724 int i, nret = 0; | |
725 for (i=0; i<readers->nreaders; i++) | |
726 { | |
727 nret += _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1); | |
728 } | |
729 return nret; | |
730 } | |
731 | |
732 int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) | |
733 { | |
734 int i, j, nsmpl, free_smpl = 0; | |
735 char **smpl = NULL; | |
736 | |
737 void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL; | |
738 if ( exclude || strcmp("-",fname) ) // "-" stands for all samples | |
739 { | |
740 smpl = hts_readlist(fname, is_file, &nsmpl); | |
741 if ( !smpl ) | |
742 { | |
743 fprintf(stderr,"Could not read the file: \"%s\"\n", fname); | |
744 return 0; | |
745 } | |
746 if ( exclude ) | |
747 { | |
748 for (i=0; i<nsmpl; i++) | |
749 khash_str2int_inc(exclude, smpl[i]); | |
750 } | |
751 free_smpl = 1; | |
752 } | |
753 if ( !smpl ) | |
754 { | |
755 smpl = files->readers[0].header->samples; // intersection of all samples | |
756 nsmpl = bcf_hdr_nsamples(files->readers[0].header); | |
757 } | |
758 | |
759 files->samples = NULL; | |
760 files->n_smpl = 0; | |
761 for (i=0; i<nsmpl; i++) | |
762 { | |
763 if ( exclude && khash_str2int_has_key(exclude,smpl[i]) ) continue; | |
764 | |
765 int n_isec = 0; | |
766 for (j=0; j<files->nreaders; j++) | |
767 { | |
768 if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break; | |
769 n_isec++; | |
770 } | |
771 if ( n_isec!=files->nreaders ) | |
772 { | |
773 fprintf(stderr,"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->readers[n_isec].fname); | |
774 continue; | |
775 } | |
776 | |
777 files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*)); | |
778 files->samples[files->n_smpl++] = strdup(smpl[i]); | |
779 } | |
780 | |
781 if ( exclude ) khash_str2int_destroy(exclude); | |
782 if ( free_smpl ) | |
783 { | |
784 for (i=0; i<nsmpl; i++) free(smpl[i]); | |
785 free(smpl); | |
786 } | |
787 | |
788 if ( !files->n_smpl ) | |
789 { | |
790 if ( files->nreaders>1 ) | |
791 fprintf(stderr,"No samples in common.\n"); | |
792 return 0; | |
793 } | |
794 for (i=0; i<files->nreaders; i++) | |
795 { | |
796 bcf_sr_t *reader = &files->readers[i]; | |
797 reader->samples = (int*) malloc(sizeof(int)*files->n_smpl); | |
798 reader->n_smpl = files->n_smpl; | |
799 for (j=0; j<files->n_smpl; j++) | |
800 reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]); | |
801 } | |
802 return 1; | |
803 } | |
804 | |
805 // Add a new region into a list sorted by start,end. On input the coordinates | |
806 // are 1-based, stored 0-based, inclusive. | |
807 static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end) | |
808 { | |
809 if ( start==-1 && end==-1 ) | |
810 { | |
811 start = 0; end = MAX_CSI_COOR-1; | |
812 } | |
813 else | |
814 { | |
815 start--; end--; // store 0-based coordinates | |
816 } | |
817 | |
818 if ( !reg->seq_hash ) | |
819 reg->seq_hash = khash_str2int_init(); | |
820 | |
821 int iseq; | |
822 if ( khash_str2int_get(reg->seq_hash, chr, &iseq)<0 ) | |
823 { | |
824 // the chromosome block does not exist | |
825 iseq = reg->nseqs++; | |
826 reg->seq_names = (char**) realloc(reg->seq_names,sizeof(char*)*reg->nseqs); | |
827 reg->regs = (region_t*) realloc(reg->regs,sizeof(region_t)*reg->nseqs); | |
828 memset(®->regs[reg->nseqs-1],0,sizeof(region_t)); | |
829 reg->seq_names[iseq] = strdup(chr); | |
830 reg->regs[iseq].creg = -1; | |
831 khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq); | |
832 } | |
833 | |
834 region_t *creg = ®->regs[iseq]; | |
835 | |
836 // the regions may not be sorted on input: binary search | |
837 int i, min = 0, max = creg->nregs - 1; | |
838 while ( min<=max ) | |
839 { | |
840 i = (max+min)/2; | |
841 if ( start < creg->regs[i].start ) max = i - 1; | |
842 else if ( start > creg->regs[i].start ) min = i + 1; | |
843 else break; | |
844 } | |
845 if ( min>max || creg->regs[i].start!=start || creg->regs[i].end!=end ) | |
846 { | |
847 // no such region, insert a new one just after max | |
848 hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs); | |
849 if ( ++max < creg->nregs ) | |
850 memmove(&creg->regs[max+1],&creg->regs[max],(creg->nregs - max)*sizeof(region1_t)); | |
851 creg->regs[max].start = start; | |
852 creg->regs[max].end = end; | |
853 creg->nregs++; | |
854 } | |
855 } | |
856 | |
857 // File name or a list of genomic locations. If file name, NULL is returned. | |
858 static bcf_sr_regions_t *_regions_init_string(const char *str) | |
859 { | |
860 bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); | |
861 reg->start = reg->end = -1; | |
862 reg->prev_start = reg->prev_seq = -1; | |
863 | |
864 kstring_t tmp = {0,0,0}; | |
865 const char *sp = str, *ep = str; | |
866 int from, to; | |
867 while ( 1 ) | |
868 { | |
869 while ( *ep && *ep!=',' && *ep!=':' ) ep++; | |
870 tmp.l = 0; | |
871 kputsn(sp,ep-sp,&tmp); | |
872 if ( *ep==':' ) | |
873 { | |
874 sp = ep+1; | |
875 from = strtol(sp,(char**)&ep,10); | |
876 if ( sp==ep ) | |
877 { | |
878 fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); | |
879 free(reg); free(tmp.s); return NULL; | |
880 } | |
881 if ( !*ep || *ep==',' ) | |
882 { | |
883 _regions_add(reg, tmp.s, from, from); | |
884 sp = ep; | |
885 continue; | |
886 } | |
887 if ( *ep!='-' ) | |
888 { | |
889 fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); | |
890 free(reg); free(tmp.s); return NULL; | |
891 } | |
892 ep++; | |
893 sp = ep; | |
894 to = strtol(sp,(char**)&ep,10); | |
895 if ( *ep && *ep!=',' ) | |
896 { | |
897 fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); | |
898 free(reg); free(tmp.s); return NULL; | |
899 } | |
900 if ( sp==ep ) to = MAX_CSI_COOR-1; | |
901 _regions_add(reg, tmp.s, from, to); | |
902 if ( !*ep ) break; | |
903 sp = ep; | |
904 } | |
905 else | |
906 { | |
907 if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1); | |
908 if ( !*ep ) break; | |
909 sp = ++ep; | |
910 } | |
911 } | |
912 free(tmp.s); | |
913 return reg; | |
914 } | |
915 | |
916 // ichr,ifrom,ito are 0-based; | |
917 // returns -1 on error, 0 if the line is a comment line, 1 on success | |
918 static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,char **chr_end,int *from,int *to) | |
919 { | |
920 *chr_end = NULL; | |
921 | |
922 if ( line[0]=='#' ) return 0; | |
923 | |
924 int k,l; // index of the start and end column of the tab-delimited file | |
925 if ( ifrom <= ito ) | |
926 k = ifrom, l = ito; | |
927 else | |
928 l = ifrom, k = ito; | |
929 | |
930 int i; | |
931 char *se = line, *ss = NULL; // start and end | |
932 char *tmp; | |
933 for (i=0; i<=k && *se; i++) | |
934 { | |
935 ss = i==0 ? se++ : ++se; | |
936 while (*se && *se!='\t') se++; | |
937 } | |
938 if ( i<=k ) return -1; | |
939 if ( k==l ) | |
940 { | |
941 *from = *to = strtol(ss, &tmp, 10); | |
942 if ( tmp==ss ) return -1; | |
943 } | |
944 else | |
945 { | |
946 if ( k==ifrom ) | |
947 *from = strtol(ss, &tmp, 10); | |
948 else | |
949 *to = strtol(ss, &tmp, 10); | |
950 if ( ss==tmp ) return -1; | |
951 | |
952 for (i=k; i<l && *se; i++) | |
953 { | |
954 ss = ++se; | |
955 while (*se && *se!='\t') se++; | |
956 } | |
957 if ( i<l ) return -1; | |
958 if ( k==ifrom ) | |
959 *to = strtol(ss, &tmp, 10); | |
960 else | |
961 *from = strtol(ss, &tmp, 10); | |
962 if ( ss==tmp ) return -1; | |
963 } | |
964 | |
965 ss = se = line; | |
966 for (i=0; i<=ichr && *se; i++) | |
967 { | |
968 if ( i>0 ) ss = ++se; | |
969 while (*se && *se!='\t') se++; | |
970 } | |
971 if ( i<=ichr ) return -1; | |
972 *chr_end = se; | |
973 *chr = ss; | |
974 return 1; | |
975 } | |
976 | |
977 bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito) | |
978 { | |
979 bcf_sr_regions_t *reg; | |
980 if ( !is_file ) return _regions_init_string(regions); | |
981 | |
982 reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); | |
983 reg->start = reg->end = -1; | |
984 reg->prev_start = reg->prev_seq = -1; | |
985 | |
986 reg->file = hts_open(regions, "rb"); | |
987 if ( !reg->file ) | |
988 { | |
989 fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions); | |
990 free(reg); | |
991 return NULL; | |
992 } | |
993 | |
994 reg->tbx = tbx_index_load(regions); | |
995 if ( !reg->tbx ) | |
996 { | |
997 int len = strlen(regions); | |
998 int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1; | |
999 if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1; | |
1000 | |
1001 if ( reg->file->format.format==vcf ) ito = 1; | |
1002 | |
1003 // read the whole file, tabix index is not present | |
1004 while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 ) | |
1005 { | |
1006 char *chr, *chr_end; | |
1007 int from, to, ret; | |
1008 ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to); | |
1009 if ( ret < 0 ) | |
1010 { | |
1011 if ( ito<0 ) | |
1012 ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to); | |
1013 if ( ret<0 ) | |
1014 { | |
1015 fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1); | |
1016 hts_close(reg->file); reg->file = NULL; free(reg); | |
1017 return NULL; | |
1018 } | |
1019 } | |
1020 if ( !ret ) continue; | |
1021 if ( is_bed ) from++; | |
1022 *chr_end = 0; | |
1023 _regions_add(reg, chr, from, to); | |
1024 *chr_end = '\t'; | |
1025 } | |
1026 hts_close(reg->file); reg->file = NULL; | |
1027 if ( !reg->nseqs ) { free(reg); return NULL; } | |
1028 return reg; | |
1029 } | |
1030 | |
1031 reg->seq_names = (char**) tbx_seqnames(reg->tbx, ®->nseqs); | |
1032 if ( !reg->seq_hash ) | |
1033 reg->seq_hash = khash_str2int_init(); | |
1034 int i; | |
1035 for (i=0; i<reg->nseqs; i++) | |
1036 { | |
1037 khash_str2int_set(reg->seq_hash,reg->seq_names[i],i); | |
1038 } | |
1039 reg->fname = strdup(regions); | |
1040 reg->is_bin = 1; | |
1041 return reg; | |
1042 } | |
1043 | |
1044 void bcf_sr_regions_destroy(bcf_sr_regions_t *reg) | |
1045 { | |
1046 int i; | |
1047 free(reg->fname); | |
1048 if ( reg->itr ) tbx_itr_destroy(reg->itr); | |
1049 if ( reg->tbx ) tbx_destroy(reg->tbx); | |
1050 if ( reg->file ) hts_close(reg->file); | |
1051 if ( reg->als ) free(reg->als); | |
1052 if ( reg->als_str.s ) free(reg->als_str.s); | |
1053 free(reg->line.s); | |
1054 if ( reg->regs ) | |
1055 { | |
1056 // free only in-memory names, tbx names are const | |
1057 for (i=0; i<reg->nseqs; i++) | |
1058 { | |
1059 free(reg->seq_names[i]); | |
1060 free(reg->regs[i].regs); | |
1061 } | |
1062 } | |
1063 free(reg->regs); | |
1064 free(reg->seq_names); | |
1065 khash_str2int_destroy(reg->seq_hash); | |
1066 free(reg); | |
1067 } | |
1068 | |
1069 int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq) | |
1070 { | |
1071 reg->iseq = reg->start = reg->end = -1; | |
1072 if ( khash_str2int_get(reg->seq_hash, seq, ®->iseq) < 0 ) return -1; // sequence seq not in regions | |
1073 | |
1074 // using in-memory regions | |
1075 if ( reg->regs ) | |
1076 { | |
1077 reg->regs[reg->iseq].creg = -1; | |
1078 return 0; | |
1079 } | |
1080 | |
1081 // reading regions from tabix | |
1082 if ( reg->itr ) tbx_itr_destroy(reg->itr); | |
1083 reg->itr = tbx_itr_querys(reg->tbx, seq); | |
1084 if ( reg->itr ) return 0; | |
1085 | |
1086 return -1; | |
1087 } | |
1088 | |
1089 int bcf_sr_regions_next(bcf_sr_regions_t *reg) | |
1090 { | |
1091 if ( reg->iseq<0 ) return -1; | |
1092 reg->start = reg->end = -1; | |
1093 reg->nals = 0; | |
1094 | |
1095 // using in-memory regions | |
1096 if ( reg->regs ) | |
1097 { | |
1098 while ( reg->iseq < reg->nseqs ) | |
1099 { | |
1100 reg->regs[reg->iseq].creg++; | |
1101 if ( reg->regs[reg->iseq].creg < reg->regs[reg->iseq].nregs ) break; | |
1102 reg->iseq++; | |
1103 } | |
1104 if ( reg->iseq >= reg->nseqs ) { reg->iseq = -1; return -1; } // no more regions left | |
1105 region1_t *creg = ®->regs[reg->iseq].regs[reg->regs[reg->iseq].creg]; | |
1106 reg->start = creg->start; | |
1107 reg->end = creg->end; | |
1108 return 0; | |
1109 } | |
1110 | |
1111 // reading from tabix | |
1112 char *chr, *chr_end; | |
1113 int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to; | |
1114 if ( reg->tbx ) | |
1115 { | |
1116 ichr = reg->tbx->conf.sc-1; | |
1117 ifrom = reg->tbx->conf.bc-1; | |
1118 ito = reg->tbx->conf.ec-1; | |
1119 if ( ito<0 ) ito = ifrom; | |
1120 is_bed = reg->tbx->conf.preset==TBX_UCSC ? 1 : 0; | |
1121 } | |
1122 | |
1123 int ret = 0; | |
1124 while ( !ret ) | |
1125 { | |
1126 if ( reg->itr ) | |
1127 { | |
1128 // tabix index present, reading a chromosome block | |
1129 ret = tbx_itr_next(reg->file, reg->tbx, reg->itr, ®->line); | |
1130 if ( ret<0 ) { reg->iseq = -1; return -1; } | |
1131 } | |
1132 else | |
1133 { | |
1134 if ( reg->is_bin ) | |
1135 { | |
1136 // Waited for seek which never came. Reopen in text mode and stream | |
1137 // through the regions, otherwise hts_getline would fail | |
1138 hts_close(reg->file); | |
1139 reg->file = hts_open(reg->fname, "r"); | |
1140 if ( !reg->file ) | |
1141 { | |
1142 fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,reg->fname); | |
1143 reg->file = NULL; | |
1144 bcf_sr_regions_destroy(reg); | |
1145 return -1; | |
1146 } | |
1147 reg->is_bin = 0; | |
1148 } | |
1149 | |
1150 // tabix index absent, reading the whole file | |
1151 ret = hts_getline(reg->file, KS_SEP_LINE, ®->line); | |
1152 if ( ret<0 ) { reg->iseq = -1; return -1; } | |
1153 } | |
1154 ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&chr_end,&from,&to); | |
1155 if ( ret<0 ) | |
1156 { | |
1157 fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d,%d\n", __FILE__,__LINE__,reg->fname,ichr+1,ifrom+1,ito+1); | |
1158 return -1; | |
1159 } | |
1160 } | |
1161 if ( is_bed ) from++; | |
1162 | |
1163 *chr_end = 0; | |
1164 if ( khash_str2int_get(reg->seq_hash, chr, ®->iseq)<0 ) | |
1165 { | |
1166 fprintf(stderr,"Broken tabix index? The sequence \"%s\" not in dictionary [%s]\n", chr,reg->line.s); | |
1167 exit(1); | |
1168 } | |
1169 *chr_end = '\t'; | |
1170 | |
1171 reg->start = from - 1; | |
1172 reg->end = to - 1; | |
1173 return 0; | |
1174 } | |
1175 | |
1176 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec) | |
1177 { | |
1178 int i = 0, max_len = 0; | |
1179 if ( !reg->nals ) | |
1180 { | |
1181 char *ss = reg->line.s; | |
1182 while ( i<als_idx && *ss ) | |
1183 { | |
1184 if ( *ss=='\t' ) i++; | |
1185 ss++; | |
1186 } | |
1187 char *se = ss; | |
1188 reg->nals = 1; | |
1189 while ( *se && *se!='\t' ) | |
1190 { | |
1191 if ( *se==',' ) reg->nals++; | |
1192 se++; | |
1193 } | |
1194 ks_resize(®->als_str, se-ss+1+reg->nals); | |
1195 reg->als_str.l = 0; | |
1196 hts_expand(char*,reg->nals,reg->mals,reg->als); | |
1197 reg->nals = 0; | |
1198 | |
1199 se = ss; | |
1200 while ( *(++se) ) | |
1201 { | |
1202 if ( *se=='\t' ) break; | |
1203 if ( *se!=',' ) continue; | |
1204 reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; | |
1205 kputsn(ss,se-ss,®->als_str); | |
1206 if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; | |
1207 reg->als_str.l++; | |
1208 reg->nals++; | |
1209 ss = ++se; | |
1210 } | |
1211 reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; | |
1212 kputsn(ss,se-ss,®->als_str); | |
1213 if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; | |
1214 reg->nals++; | |
1215 reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP; // this is a simplified check, see vcf.c:bcf_set_variant_types | |
1216 } | |
1217 int type = bcf_get_variant_types(rec); | |
1218 if ( reg->als_type & VCF_INDEL ) | |
1219 return type & VCF_INDEL ? 1 : 0; | |
1220 return !(type & VCF_INDEL) ? 1 : 0; | |
1221 } | |
1222 | |
1223 int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end) | |
1224 { | |
1225 int iseq; | |
1226 if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence | |
1227 | |
1228 if ( reg->prev_seq==-1 || iseq!=reg->prev_seq || reg->prev_start > start ) // new chromosome or after a seek | |
1229 { | |
1230 // flush regions left on previous chromosome | |
1231 if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 ) | |
1232 bcf_sr_regions_flush(reg); | |
1233 | |
1234 bcf_sr_regions_seek(reg, seq); | |
1235 reg->start = reg->end = -1; | |
1236 } | |
1237 if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome | |
1238 reg->prev_seq = reg->iseq; | |
1239 reg->prev_start = start; | |
1240 | |
1241 while ( iseq==reg->iseq && reg->end < start ) | |
1242 { | |
1243 if ( bcf_sr_regions_next(reg) < 0 ) return -2; // no more regions left | |
1244 if ( reg->iseq != iseq ) return -1; // does not overlap any regions | |
1245 if ( reg->missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data); | |
1246 } | |
1247 if ( reg->start <= end ) return 0; // region overlap | |
1248 return -1; // no overlap | |
1249 } | |
1250 | |
1251 void bcf_sr_regions_flush(bcf_sr_regions_t *reg) | |
1252 { | |
1253 if ( !reg->missed_reg_handler || reg->prev_seq==-1 ) return; | |
1254 while ( !bcf_sr_regions_next(reg) ) reg->missed_reg_handler(reg, reg->missed_reg_data); | |
1255 return; | |
1256 } | |
1257 |