Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/test/test-vcf-api.c @ 0:dfa3745e5fd8
Uploaded
| author | youngkim |
|---|---|
| date | Thu, 24 Mar 2016 17:12:52 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfa3745e5fd8 |
|---|---|
| 1 /* test/test-vcf-api.c -- VCF test harness. | |
| 2 | |
| 3 Copyright (C) 2013, 2014 Genome Research Ltd. | |
| 4 | |
| 5 Author: Petr Danecek <pd3@sanger.ac.uk> | |
| 6 | |
| 7 Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 8 of this software and associated documentation files (the "Software"), to deal | |
| 9 in the Software without restriction, including without limitation the rights | |
| 10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 11 copies of the Software, and to permit persons to whom the Software is | |
| 12 furnished to do so, subject to the following conditions: | |
| 13 | |
| 14 The above copyright notice and this permission notice shall be included in | |
| 15 all copies or substantial portions of the Software. | |
| 16 | |
| 17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
| 20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
| 22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
| 23 DEALINGS IN THE SOFTWARE. */ | |
| 24 | |
| 25 #include <stdio.h> | |
| 26 #include <htslib/hts.h> | |
| 27 #include <htslib/vcf.h> | |
| 28 #include <htslib/kstring.h> | |
| 29 #include <htslib/kseq.h> | |
| 30 | |
| 31 void write_bcf(char *fname) | |
| 32 { | |
| 33 // Init | |
| 34 htsFile *fp = hts_open(fname,"wb"); | |
| 35 bcf_hdr_t *hdr = bcf_hdr_init("w"); | |
| 36 bcf1_t *rec = bcf_init1(); | |
| 37 | |
| 38 // Create VCF header | |
| 39 kstring_t str = {0,0,0}; | |
| 40 bcf_hdr_append(hdr, "##fileDate=20090805"); | |
| 41 bcf_hdr_append(hdr, "##FORMAT=<ID=UF,Number=1,Type=Integer,Description=\"Unused FORMAT\">"); | |
| 42 bcf_hdr_append(hdr, "##INFO=<ID=UI,Number=1,Type=Integer,Description=\"Unused INFO\">"); | |
| 43 bcf_hdr_append(hdr, "##FILTER=<ID=Flt,Description=\"Unused FILTER\">"); | |
| 44 bcf_hdr_append(hdr, "##unused=<XX=AA,Description=\"Unused generic\">"); | |
| 45 bcf_hdr_append(hdr, "##unused=unformatted text 1"); | |
| 46 bcf_hdr_append(hdr, "##unused=unformatted text 2"); | |
| 47 bcf_hdr_append(hdr, "##contig=<ID=Unused,length=62435964>"); | |
| 48 bcf_hdr_append(hdr, "##source=myImputationProgramV3.1"); | |
| 49 bcf_hdr_append(hdr, "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta"); | |
| 50 bcf_hdr_append(hdr, "##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\",taxonomy=x>"); | |
| 51 bcf_hdr_append(hdr, "##phasing=partial"); | |
| 52 bcf_hdr_append(hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">"); | |
| 53 bcf_hdr_append(hdr, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">"); | |
| 54 bcf_hdr_append(hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">"); | |
| 55 bcf_hdr_append(hdr, "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">"); | |
| 56 bcf_hdr_append(hdr, "##INFO=<ID=DB,Number=0,Type=Flag,Description=\"dbSNP membership, build 129\">"); | |
| 57 bcf_hdr_append(hdr, "##INFO=<ID=H2,Number=0,Type=Flag,Description=\"HapMap2 membership\">"); | |
| 58 bcf_hdr_append(hdr, "##FILTER=<ID=q10,Description=\"Quality below 10\">"); | |
| 59 bcf_hdr_append(hdr, "##FILTER=<ID=s50,Description=\"Less than 50% of samples have data\">"); | |
| 60 bcf_hdr_append(hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"); | |
| 61 bcf_hdr_append(hdr, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">"); | |
| 62 bcf_hdr_append(hdr, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">"); | |
| 63 bcf_hdr_append(hdr, "##FORMAT=<ID=HQ,Number=2,Type=Integer,Description=\"Haplotype Quality\">"); | |
| 64 bcf_hdr_append(hdr, "##FORMAT=<ID=TS,Number=1,Type=String,Description=\"Test String\">"); | |
| 65 | |
| 66 bcf_hdr_add_sample(hdr, "NA00001"); | |
| 67 bcf_hdr_add_sample(hdr, "NA00002"); | |
| 68 bcf_hdr_add_sample(hdr, "NA00003"); | |
| 69 bcf_hdr_add_sample(hdr, NULL); // to update internal structures | |
| 70 bcf_hdr_write(fp, hdr); | |
| 71 | |
| 72 | |
| 73 // Add a record | |
| 74 // 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. | |
| 75 // .. CHROM | |
| 76 rec->rid = bcf_hdr_name2id(hdr, "20"); | |
| 77 // .. POS | |
| 78 rec->pos = 14369; | |
| 79 // .. ID | |
| 80 bcf_update_id(hdr, rec, "rs6054257"); | |
| 81 // .. REF and ALT | |
| 82 bcf_update_alleles_str(hdr, rec, "G,A"); | |
| 83 // .. QUAL | |
| 84 rec->qual = 29; | |
| 85 // .. FILTER | |
| 86 int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS"); | |
| 87 bcf_update_filter(hdr, rec, &tmpi, 1); | |
| 88 // .. INFO | |
| 89 tmpi = 3; | |
| 90 bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1); | |
| 91 tmpi = 14; | |
| 92 bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1); | |
| 93 float tmpf = 0.5; | |
| 94 bcf_update_info_float(hdr, rec, "AF", &tmpf, 1); | |
| 95 bcf_update_info_flag(hdr, rec, "DB", NULL, 1); | |
| 96 bcf_update_info_flag(hdr, rec, "H2", NULL, 1); | |
| 97 // .. FORMAT | |
| 98 int32_t *tmpia = (int*)malloc(bcf_hdr_nsamples(hdr)*2*sizeof(int)); | |
| 99 tmpia[0] = bcf_gt_phased(0); | |
| 100 tmpia[1] = bcf_gt_phased(0); | |
| 101 tmpia[2] = bcf_gt_phased(1); | |
| 102 tmpia[3] = bcf_gt_phased(0); | |
| 103 tmpia[4] = bcf_gt_unphased(1); | |
| 104 tmpia[5] = bcf_gt_unphased(1); | |
| 105 bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2); | |
| 106 tmpia[0] = 48; | |
| 107 tmpia[1] = 48; | |
| 108 tmpia[2] = 43; | |
| 109 bcf_update_format_int32(hdr, rec, "GQ", tmpia, bcf_hdr_nsamples(hdr)); | |
| 110 tmpia[0] = 1; | |
| 111 tmpia[1] = 8; | |
| 112 tmpia[2] = 5; | |
| 113 bcf_update_format_int32(hdr, rec, "DP", tmpia, bcf_hdr_nsamples(hdr)); | |
| 114 tmpia[0] = 51; | |
| 115 tmpia[1] = 51; | |
| 116 tmpia[2] = 51; | |
| 117 tmpia[3] = 51; | |
| 118 tmpia[4] = bcf_int32_missing; | |
| 119 tmpia[5] = bcf_int32_missing; | |
| 120 bcf_update_format_int32(hdr, rec, "HQ", tmpia, bcf_hdr_nsamples(hdr)*2); | |
| 121 char *tmp_str[] = {"String1","SomeOtherString2","YetAnotherString3"}; | |
| 122 bcf_update_format_string(hdr, rec, "TS", (const char**)tmp_str, 3); | |
| 123 bcf_write1(fp, hdr, rec); | |
| 124 | |
| 125 // 20 1110696 . A G,T 67 . NS=2;DP=10;AF=0.333,.;AA=T;DB GT 2 1 ./. | |
| 126 bcf_clear1(rec); | |
| 127 rec->rid = bcf_hdr_name2id(hdr, "20"); | |
| 128 rec->pos = 1110695; | |
| 129 bcf_update_alleles_str(hdr, rec, "A,G,T"); | |
| 130 rec->qual = 67; | |
| 131 tmpi = 2; | |
| 132 bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1); | |
| 133 tmpi = 10; | |
| 134 bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1); | |
| 135 float *tmpfa = (float*)malloc(2*sizeof(float)); | |
| 136 tmpfa[0] = 0.333; | |
| 137 bcf_float_set_missing(tmpfa[1]); | |
| 138 bcf_update_info_float(hdr, rec, "AF", tmpfa, 2); | |
| 139 bcf_update_info_string(hdr, rec, "AA", "T"); | |
| 140 bcf_update_info_flag(hdr, rec, "DB", NULL, 1); | |
| 141 tmpia[0] = bcf_gt_phased(2); | |
| 142 tmpia[1] = bcf_int32_vector_end; | |
| 143 tmpia[2] = bcf_gt_phased(1); | |
| 144 tmpia[3] = bcf_int32_vector_end; | |
| 145 tmpia[4] = bcf_gt_missing; | |
| 146 tmpia[5] = bcf_gt_missing; | |
| 147 bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2); | |
| 148 bcf_write1(fp, hdr, rec); | |
| 149 | |
| 150 free(tmpia); | |
| 151 free(tmpfa); | |
| 152 | |
| 153 // Clean | |
| 154 free(str.s); | |
| 155 bcf_destroy1(rec); | |
| 156 bcf_hdr_destroy(hdr); | |
| 157 int ret; | |
| 158 if ( (ret=hts_close(fp)) ) | |
| 159 { | |
| 160 fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); | |
| 161 exit(ret); | |
| 162 } | |
| 163 } | |
| 164 | |
| 165 void bcf_to_vcf(char *fname) | |
| 166 { | |
| 167 htsFile *fp = hts_open(fname,"rb"); | |
| 168 bcf_hdr_t *hdr = bcf_hdr_read(fp); | |
| 169 bcf1_t *rec = bcf_init1(); | |
| 170 | |
| 171 char *gz_fname = (char*) malloc(strlen(fname)+4); | |
| 172 snprintf(gz_fname,strlen(fname)+4,"%s.gz",fname); | |
| 173 htsFile *out = hts_open(gz_fname,"wg"); | |
| 174 | |
| 175 bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr); | |
| 176 bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused"); | |
| 177 bcf_hdr_remove(hdr_out,BCF_HL_GEN,"unused"); | |
| 178 bcf_hdr_remove(hdr_out,BCF_HL_FLT,"Flt"); | |
| 179 bcf_hdr_remove(hdr_out,BCF_HL_INFO,"UI"); | |
| 180 bcf_hdr_remove(hdr_out,BCF_HL_FMT,"UF"); | |
| 181 bcf_hdr_remove(hdr_out,BCF_HL_CTG,"Unused"); | |
| 182 bcf_hdr_write(out, hdr_out); | |
| 183 | |
| 184 while ( bcf_read1(fp, hdr, rec)>=0 ) | |
| 185 { | |
| 186 bcf_write1(out, hdr_out, rec); | |
| 187 | |
| 188 // Test problems caused by bcf1_sync: the data block | |
| 189 // may be realloced, also the unpacked structures must | |
| 190 // get updated. | |
| 191 bcf_unpack(rec, BCF_UN_STR); | |
| 192 bcf_update_id(hdr, rec, 0); | |
| 193 bcf_update_format_int32(hdr, rec, "GQ", NULL, 0); | |
| 194 | |
| 195 bcf1_t *dup = bcf_dup(rec); // force bcf1_sync call | |
| 196 bcf_write1(out, hdr_out, dup); | |
| 197 bcf_destroy1(dup); | |
| 198 | |
| 199 bcf_update_alleles_str(hdr_out, rec, "G,A"); | |
| 200 int32_t tmpi = 99; | |
| 201 bcf_update_info_int32(hdr_out, rec, "DP", &tmpi, 1); | |
| 202 int32_t tmpia[] = {9,9,9}; | |
| 203 bcf_update_format_int32(hdr_out, rec, "DP", tmpia, 3); | |
| 204 | |
| 205 bcf_write1(out, hdr_out, rec); | |
| 206 } | |
| 207 | |
| 208 bcf_destroy1(rec); | |
| 209 bcf_hdr_destroy(hdr); | |
| 210 bcf_hdr_destroy(hdr_out); | |
| 211 int ret; | |
| 212 if ( (ret=hts_close(fp)) ) | |
| 213 { | |
| 214 fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); | |
| 215 exit(ret); | |
| 216 } | |
| 217 if ( (ret=hts_close(out)) ) | |
| 218 { | |
| 219 fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret); | |
| 220 exit(ret); | |
| 221 } | |
| 222 | |
| 223 | |
| 224 // read gzip, write stdout | |
| 225 htsFile *gz_in = hts_open(gz_fname, "r"); | |
| 226 if ( !gz_in ) | |
| 227 { | |
| 228 fprintf(stderr,"Could not read: %s\n", gz_fname); | |
| 229 exit(1); | |
| 230 } | |
| 231 | |
| 232 kstring_t line = {0,0,0}; | |
| 233 while ( hts_getline(gz_in, KS_SEP_LINE, &line)>0 ) | |
| 234 { | |
| 235 kputc('\n',&line); | |
| 236 fwrite(line.s,1,line.l,stdout); | |
| 237 } | |
| 238 | |
| 239 if ( (ret=hts_close(gz_in)) ) | |
| 240 { | |
| 241 fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret); | |
| 242 exit(ret); | |
| 243 } | |
| 244 free(line.s); | |
| 245 free(gz_fname); | |
| 246 } | |
| 247 | |
| 248 void iterator(const char *fname) | |
| 249 { | |
| 250 htsFile *fp = hts_open(fname, "r"); | |
| 251 bcf_hdr_t *hdr = bcf_hdr_read(fp); | |
| 252 hts_idx_t *idx; | |
| 253 hts_itr_t *iter; | |
| 254 | |
| 255 bcf_index_build(fname, 0); | |
| 256 idx = bcf_index_load(fname); | |
| 257 | |
| 258 iter = bcf_itr_queryi(idx, bcf_hdr_name2id(hdr, "20"), 1110600, 1110800); | |
| 259 bcf_itr_destroy(iter); | |
| 260 | |
| 261 iter = bcf_itr_querys(idx, hdr, "20:1110600-1110800"); | |
| 262 bcf_itr_destroy(iter); | |
| 263 | |
| 264 hts_idx_destroy(idx); | |
| 265 bcf_hdr_destroy(hdr); | |
| 266 int ret; | |
| 267 if ( (ret=hts_close(fp)) ) | |
| 268 { | |
| 269 fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); | |
| 270 exit(ret); | |
| 271 } | |
| 272 } | |
| 273 | |
| 274 int main(int argc, char **argv) | |
| 275 { | |
| 276 char *fname = argc>1 ? argv[1] : "rmme.bcf"; | |
| 277 write_bcf(fname); | |
| 278 bcf_to_vcf(fname); | |
| 279 iterator(fname); | |
| 280 return 0; | |
| 281 } | |
| 282 |
