Mercurial > repos > youngkim > ezbamqc
diff ezBAMQC/src/htslib/test/test-vcf-api.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ezBAMQC/src/htslib/test/test-vcf-api.c Thu Mar 24 17:12:52 2016 -0400 @@ -0,0 +1,282 @@ +/* test/test-vcf-api.c -- VCF test harness. + + Copyright (C) 2013, 2014 Genome Research Ltd. + + Author: Petr Danecek <pd3@sanger.ac.uk> + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include <stdio.h> +#include <htslib/hts.h> +#include <htslib/vcf.h> +#include <htslib/kstring.h> +#include <htslib/kseq.h> + +void write_bcf(char *fname) +{ + // Init + htsFile *fp = hts_open(fname,"wb"); + bcf_hdr_t *hdr = bcf_hdr_init("w"); + bcf1_t *rec = bcf_init1(); + + // Create VCF header + kstring_t str = {0,0,0}; + bcf_hdr_append(hdr, "##fileDate=20090805"); + bcf_hdr_append(hdr, "##FORMAT=<ID=UF,Number=1,Type=Integer,Description=\"Unused FORMAT\">"); + bcf_hdr_append(hdr, "##INFO=<ID=UI,Number=1,Type=Integer,Description=\"Unused INFO\">"); + bcf_hdr_append(hdr, "##FILTER=<ID=Flt,Description=\"Unused FILTER\">"); + bcf_hdr_append(hdr, "##unused=<XX=AA,Description=\"Unused generic\">"); + bcf_hdr_append(hdr, "##unused=unformatted text 1"); + bcf_hdr_append(hdr, "##unused=unformatted text 2"); + bcf_hdr_append(hdr, "##contig=<ID=Unused,length=62435964>"); + bcf_hdr_append(hdr, "##source=myImputationProgramV3.1"); + bcf_hdr_append(hdr, "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta"); + bcf_hdr_append(hdr, "##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\",taxonomy=x>"); + bcf_hdr_append(hdr, "##phasing=partial"); + bcf_hdr_append(hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">"); + bcf_hdr_append(hdr, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">"); + bcf_hdr_append(hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">"); + bcf_hdr_append(hdr, "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">"); + bcf_hdr_append(hdr, "##INFO=<ID=DB,Number=0,Type=Flag,Description=\"dbSNP membership, build 129\">"); + bcf_hdr_append(hdr, "##INFO=<ID=H2,Number=0,Type=Flag,Description=\"HapMap2 membership\">"); + bcf_hdr_append(hdr, "##FILTER=<ID=q10,Description=\"Quality below 10\">"); + bcf_hdr_append(hdr, "##FILTER=<ID=s50,Description=\"Less than 50% of samples have data\">"); + bcf_hdr_append(hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"); + bcf_hdr_append(hdr, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">"); + bcf_hdr_append(hdr, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">"); + bcf_hdr_append(hdr, "##FORMAT=<ID=HQ,Number=2,Type=Integer,Description=\"Haplotype Quality\">"); + bcf_hdr_append(hdr, "##FORMAT=<ID=TS,Number=1,Type=String,Description=\"Test String\">"); + + bcf_hdr_add_sample(hdr, "NA00001"); + bcf_hdr_add_sample(hdr, "NA00002"); + bcf_hdr_add_sample(hdr, "NA00003"); + bcf_hdr_add_sample(hdr, NULL); // to update internal structures + bcf_hdr_write(fp, hdr); + + + // Add a record + // 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. + // .. CHROM + rec->rid = bcf_hdr_name2id(hdr, "20"); + // .. POS + rec->pos = 14369; + // .. ID + bcf_update_id(hdr, rec, "rs6054257"); + // .. REF and ALT + bcf_update_alleles_str(hdr, rec, "G,A"); + // .. QUAL + rec->qual = 29; + // .. FILTER + int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS"); + bcf_update_filter(hdr, rec, &tmpi, 1); + // .. INFO + tmpi = 3; + bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1); + tmpi = 14; + bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1); + float tmpf = 0.5; + bcf_update_info_float(hdr, rec, "AF", &tmpf, 1); + bcf_update_info_flag(hdr, rec, "DB", NULL, 1); + bcf_update_info_flag(hdr, rec, "H2", NULL, 1); + // .. FORMAT + int32_t *tmpia = (int*)malloc(bcf_hdr_nsamples(hdr)*2*sizeof(int)); + tmpia[0] = bcf_gt_phased(0); + tmpia[1] = bcf_gt_phased(0); + tmpia[2] = bcf_gt_phased(1); + tmpia[3] = bcf_gt_phased(0); + tmpia[4] = bcf_gt_unphased(1); + tmpia[5] = bcf_gt_unphased(1); + bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2); + tmpia[0] = 48; + tmpia[1] = 48; + tmpia[2] = 43; + bcf_update_format_int32(hdr, rec, "GQ", tmpia, bcf_hdr_nsamples(hdr)); + tmpia[0] = 1; + tmpia[1] = 8; + tmpia[2] = 5; + bcf_update_format_int32(hdr, rec, "DP", tmpia, bcf_hdr_nsamples(hdr)); + tmpia[0] = 51; + tmpia[1] = 51; + tmpia[2] = 51; + tmpia[3] = 51; + tmpia[4] = bcf_int32_missing; + tmpia[5] = bcf_int32_missing; + bcf_update_format_int32(hdr, rec, "HQ", tmpia, bcf_hdr_nsamples(hdr)*2); + char *tmp_str[] = {"String1","SomeOtherString2","YetAnotherString3"}; + bcf_update_format_string(hdr, rec, "TS", (const char**)tmp_str, 3); + bcf_write1(fp, hdr, rec); + + // 20 1110696 . A G,T 67 . NS=2;DP=10;AF=0.333,.;AA=T;DB GT 2 1 ./. + bcf_clear1(rec); + rec->rid = bcf_hdr_name2id(hdr, "20"); + rec->pos = 1110695; + bcf_update_alleles_str(hdr, rec, "A,G,T"); + rec->qual = 67; + tmpi = 2; + bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1); + tmpi = 10; + bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1); + float *tmpfa = (float*)malloc(2*sizeof(float)); + tmpfa[0] = 0.333; + bcf_float_set_missing(tmpfa[1]); + bcf_update_info_float(hdr, rec, "AF", tmpfa, 2); + bcf_update_info_string(hdr, rec, "AA", "T"); + bcf_update_info_flag(hdr, rec, "DB", NULL, 1); + tmpia[0] = bcf_gt_phased(2); + tmpia[1] = bcf_int32_vector_end; + tmpia[2] = bcf_gt_phased(1); + tmpia[3] = bcf_int32_vector_end; + tmpia[4] = bcf_gt_missing; + tmpia[5] = bcf_gt_missing; + bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2); + bcf_write1(fp, hdr, rec); + + free(tmpia); + free(tmpfa); + + // Clean + free(str.s); + bcf_destroy1(rec); + bcf_hdr_destroy(hdr); + int ret; + if ( (ret=hts_close(fp)) ) + { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); + exit(ret); + } +} + +void bcf_to_vcf(char *fname) +{ + htsFile *fp = hts_open(fname,"rb"); + bcf_hdr_t *hdr = bcf_hdr_read(fp); + bcf1_t *rec = bcf_init1(); + + char *gz_fname = (char*) malloc(strlen(fname)+4); + snprintf(gz_fname,strlen(fname)+4,"%s.gz",fname); + htsFile *out = hts_open(gz_fname,"wg"); + + bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr); + bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused"); + bcf_hdr_remove(hdr_out,BCF_HL_GEN,"unused"); + bcf_hdr_remove(hdr_out,BCF_HL_FLT,"Flt"); + bcf_hdr_remove(hdr_out,BCF_HL_INFO,"UI"); + bcf_hdr_remove(hdr_out,BCF_HL_FMT,"UF"); + bcf_hdr_remove(hdr_out,BCF_HL_CTG,"Unused"); + bcf_hdr_write(out, hdr_out); + + while ( bcf_read1(fp, hdr, rec)>=0 ) + { + bcf_write1(out, hdr_out, rec); + + // Test problems caused by bcf1_sync: the data block + // may be realloced, also the unpacked structures must + // get updated. + bcf_unpack(rec, BCF_UN_STR); + bcf_update_id(hdr, rec, 0); + bcf_update_format_int32(hdr, rec, "GQ", NULL, 0); + + bcf1_t *dup = bcf_dup(rec); // force bcf1_sync call + bcf_write1(out, hdr_out, dup); + bcf_destroy1(dup); + + bcf_update_alleles_str(hdr_out, rec, "G,A"); + int32_t tmpi = 99; + bcf_update_info_int32(hdr_out, rec, "DP", &tmpi, 1); + int32_t tmpia[] = {9,9,9}; + bcf_update_format_int32(hdr_out, rec, "DP", tmpia, 3); + + bcf_write1(out, hdr_out, rec); + } + + bcf_destroy1(rec); + bcf_hdr_destroy(hdr); + bcf_hdr_destroy(hdr_out); + int ret; + if ( (ret=hts_close(fp)) ) + { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); + exit(ret); + } + if ( (ret=hts_close(out)) ) + { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret); + exit(ret); + } + + + // read gzip, write stdout + htsFile *gz_in = hts_open(gz_fname, "r"); + if ( !gz_in ) + { + fprintf(stderr,"Could not read: %s\n", gz_fname); + exit(1); + } + + kstring_t line = {0,0,0}; + while ( hts_getline(gz_in, KS_SEP_LINE, &line)>0 ) + { + kputc('\n',&line); + fwrite(line.s,1,line.l,stdout); + } + + if ( (ret=hts_close(gz_in)) ) + { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret); + exit(ret); + } + free(line.s); + free(gz_fname); +} + +void iterator(const char *fname) +{ + htsFile *fp = hts_open(fname, "r"); + bcf_hdr_t *hdr = bcf_hdr_read(fp); + hts_idx_t *idx; + hts_itr_t *iter; + + bcf_index_build(fname, 0); + idx = bcf_index_load(fname); + + iter = bcf_itr_queryi(idx, bcf_hdr_name2id(hdr, "20"), 1110600, 1110800); + bcf_itr_destroy(iter); + + iter = bcf_itr_querys(idx, hdr, "20:1110600-1110800"); + bcf_itr_destroy(iter); + + hts_idx_destroy(idx); + bcf_hdr_destroy(hdr); + int ret; + if ( (ret=hts_close(fp)) ) + { + fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); + exit(ret); + } +} + +int main(int argc, char **argv) +{ + char *fname = argc>1 ? argv[1] : "rmme.bcf"; + write_bcf(fname); + bcf_to_vcf(fname); + iterator(fname); + return 0; +} +