| 
0
 | 
     1 /*  test/test-vcf-api.c -- VCF test harness.
 | 
| 
 | 
     2 
 | 
| 
 | 
     3     Copyright (C) 2013, 2014 Genome Research Ltd.
 | 
| 
 | 
     4 
 | 
| 
 | 
     5     Author: Petr Danecek <pd3@sanger.ac.uk>
 | 
| 
 | 
     6 
 | 
| 
 | 
     7 Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
| 
 | 
     8 of this software and associated documentation files (the "Software"), to deal
 | 
| 
 | 
     9 in the Software without restriction, including without limitation the rights
 | 
| 
 | 
    10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
| 
 | 
    11 copies of the Software, and to permit persons to whom the Software is
 | 
| 
 | 
    12 furnished to do so, subject to the following conditions:
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 The above copyright notice and this permission notice shall be included in
 | 
| 
 | 
    15 all copies or substantial portions of the Software.
 | 
| 
 | 
    16 
 | 
| 
 | 
    17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
| 
 | 
    18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
| 
 | 
    19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 | 
| 
 | 
    20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
| 
 | 
    21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 | 
| 
 | 
    22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 | 
| 
 | 
    23 DEALINGS IN THE SOFTWARE.  */
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 #include <stdio.h>
 | 
| 
 | 
    26 #include <htslib/hts.h>
 | 
| 
 | 
    27 #include <htslib/vcf.h>
 | 
| 
 | 
    28 #include <htslib/kstring.h>
 | 
| 
 | 
    29 #include <htslib/kseq.h>
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 void write_bcf(char *fname)
 | 
| 
 | 
    32 {
 | 
| 
 | 
    33     // Init
 | 
| 
 | 
    34     htsFile *fp    = hts_open(fname,"wb");
 | 
| 
 | 
    35     bcf_hdr_t *hdr = bcf_hdr_init("w");
 | 
| 
 | 
    36     bcf1_t *rec    = bcf_init1();
 | 
| 
 | 
    37 
 | 
| 
 | 
    38     // Create VCF header
 | 
| 
 | 
    39     kstring_t str = {0,0,0};
 | 
| 
 | 
    40     bcf_hdr_append(hdr, "##fileDate=20090805");
 | 
| 
 | 
    41     bcf_hdr_append(hdr, "##FORMAT=<ID=UF,Number=1,Type=Integer,Description=\"Unused FORMAT\">");
 | 
| 
 | 
    42     bcf_hdr_append(hdr, "##INFO=<ID=UI,Number=1,Type=Integer,Description=\"Unused INFO\">");
 | 
| 
 | 
    43     bcf_hdr_append(hdr, "##FILTER=<ID=Flt,Description=\"Unused FILTER\">");
 | 
| 
 | 
    44     bcf_hdr_append(hdr, "##unused=<XX=AA,Description=\"Unused generic\">");
 | 
| 
 | 
    45     bcf_hdr_append(hdr, "##unused=unformatted text 1");
 | 
| 
 | 
    46     bcf_hdr_append(hdr, "##unused=unformatted text 2");
 | 
| 
 | 
    47     bcf_hdr_append(hdr, "##contig=<ID=Unused,length=62435964>");
 | 
| 
 | 
    48     bcf_hdr_append(hdr, "##source=myImputationProgramV3.1");
 | 
| 
 | 
    49     bcf_hdr_append(hdr, "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta");
 | 
| 
 | 
    50     bcf_hdr_append(hdr, "##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\",taxonomy=x>");
 | 
| 
 | 
    51     bcf_hdr_append(hdr, "##phasing=partial");
 | 
| 
 | 
    52     bcf_hdr_append(hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
 | 
| 
 | 
    53     bcf_hdr_append(hdr, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
 | 
| 
 | 
    54     bcf_hdr_append(hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">");
 | 
| 
 | 
    55     bcf_hdr_append(hdr, "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">");
 | 
| 
 | 
    56     bcf_hdr_append(hdr, "##INFO=<ID=DB,Number=0,Type=Flag,Description=\"dbSNP membership, build 129\">");
 | 
| 
 | 
    57     bcf_hdr_append(hdr, "##INFO=<ID=H2,Number=0,Type=Flag,Description=\"HapMap2 membership\">");
 | 
| 
 | 
    58     bcf_hdr_append(hdr, "##FILTER=<ID=q10,Description=\"Quality below 10\">");
 | 
| 
 | 
    59     bcf_hdr_append(hdr, "##FILTER=<ID=s50,Description=\"Less than 50% of samples have data\">");
 | 
| 
 | 
    60     bcf_hdr_append(hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
 | 
| 
 | 
    61     bcf_hdr_append(hdr, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
 | 
| 
 | 
    62     bcf_hdr_append(hdr, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">");
 | 
| 
 | 
    63     bcf_hdr_append(hdr, "##FORMAT=<ID=HQ,Number=2,Type=Integer,Description=\"Haplotype Quality\">");
 | 
| 
 | 
    64     bcf_hdr_append(hdr, "##FORMAT=<ID=TS,Number=1,Type=String,Description=\"Test String\">");
 | 
| 
 | 
    65 
 | 
| 
 | 
    66     bcf_hdr_add_sample(hdr, "NA00001");
 | 
| 
 | 
    67     bcf_hdr_add_sample(hdr, "NA00002");
 | 
| 
 | 
    68     bcf_hdr_add_sample(hdr, "NA00003");
 | 
| 
 | 
    69     bcf_hdr_add_sample(hdr, NULL);      // to update internal structures
 | 
| 
 | 
    70     bcf_hdr_write(fp, hdr);
 | 
| 
 | 
    71 
 | 
| 
 | 
    72 
 | 
| 
 | 
    73     // Add a record
 | 
| 
 | 
    74     // 20     14370   rs6054257 G      A       29   PASS   NS=3;DP=14;AF=0.5;DB;H2           GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
 | 
| 
 | 
    75     // .. CHROM
 | 
| 
 | 
    76     rec->rid = bcf_hdr_name2id(hdr, "20");
 | 
| 
 | 
    77     // .. POS
 | 
| 
 | 
    78     rec->pos = 14369;
 | 
| 
 | 
    79     // .. ID
 | 
| 
 | 
    80     bcf_update_id(hdr, rec, "rs6054257");
 | 
| 
 | 
    81     // .. REF and ALT
 | 
| 
 | 
    82     bcf_update_alleles_str(hdr, rec, "G,A");
 | 
| 
 | 
    83     // .. QUAL
 | 
| 
 | 
    84     rec->qual = 29;
 | 
| 
 | 
    85     // .. FILTER
 | 
| 
 | 
    86     int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS");
 | 
| 
 | 
    87     bcf_update_filter(hdr, rec, &tmpi, 1);
 | 
| 
 | 
    88     // .. INFO
 | 
| 
 | 
    89     tmpi = 3;
 | 
| 
 | 
    90     bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);
 | 
| 
 | 
    91     tmpi = 14;
 | 
| 
 | 
    92     bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);
 | 
| 
 | 
    93     float tmpf = 0.5;
 | 
| 
 | 
    94     bcf_update_info_float(hdr, rec, "AF", &tmpf, 1);
 | 
| 
 | 
    95     bcf_update_info_flag(hdr, rec, "DB", NULL, 1);
 | 
| 
 | 
    96     bcf_update_info_flag(hdr, rec, "H2", NULL, 1);
 | 
| 
 | 
    97     // .. FORMAT
 | 
| 
 | 
    98     int32_t *tmpia = (int*)malloc(bcf_hdr_nsamples(hdr)*2*sizeof(int));
 | 
| 
 | 
    99     tmpia[0] = bcf_gt_phased(0);
 | 
| 
 | 
   100     tmpia[1] = bcf_gt_phased(0);
 | 
| 
 | 
   101     tmpia[2] = bcf_gt_phased(1);
 | 
| 
 | 
   102     tmpia[3] = bcf_gt_phased(0);
 | 
| 
 | 
   103     tmpia[4] = bcf_gt_unphased(1);
 | 
| 
 | 
   104     tmpia[5] = bcf_gt_unphased(1);
 | 
| 
 | 
   105     bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2);
 | 
| 
 | 
   106     tmpia[0] = 48;
 | 
| 
 | 
   107     tmpia[1] = 48;
 | 
| 
 | 
   108     tmpia[2] = 43;
 | 
| 
 | 
   109     bcf_update_format_int32(hdr, rec, "GQ", tmpia, bcf_hdr_nsamples(hdr));
 | 
| 
 | 
   110     tmpia[0] = 1;
 | 
| 
 | 
   111     tmpia[1] = 8;
 | 
| 
 | 
   112     tmpia[2] = 5;
 | 
| 
 | 
   113     bcf_update_format_int32(hdr, rec, "DP", tmpia, bcf_hdr_nsamples(hdr));
 | 
| 
 | 
   114     tmpia[0] = 51;
 | 
| 
 | 
   115     tmpia[1] = 51;
 | 
| 
 | 
   116     tmpia[2] = 51;
 | 
| 
 | 
   117     tmpia[3] = 51;
 | 
| 
 | 
   118     tmpia[4] = bcf_int32_missing;
 | 
| 
 | 
   119     tmpia[5] = bcf_int32_missing;
 | 
| 
 | 
   120     bcf_update_format_int32(hdr, rec, "HQ", tmpia, bcf_hdr_nsamples(hdr)*2);
 | 
| 
 | 
   121     char *tmp_str[] = {"String1","SomeOtherString2","YetAnotherString3"};
 | 
| 
 | 
   122     bcf_update_format_string(hdr, rec, "TS", (const char**)tmp_str, 3);
 | 
| 
 | 
   123     bcf_write1(fp, hdr, rec);
 | 
| 
 | 
   124 
 | 
| 
 | 
   125     // 20     1110696 . A      G,T     67   .   NS=2;DP=10;AF=0.333,.;AA=T;DB GT 2 1   ./.
 | 
| 
 | 
   126     bcf_clear1(rec);
 | 
| 
 | 
   127     rec->rid = bcf_hdr_name2id(hdr, "20");
 | 
| 
 | 
   128     rec->pos = 1110695;
 | 
| 
 | 
   129     bcf_update_alleles_str(hdr, rec, "A,G,T");
 | 
| 
 | 
   130     rec->qual = 67;
 | 
| 
 | 
   131     tmpi = 2;
 | 
| 
 | 
   132     bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);
 | 
| 
 | 
   133     tmpi = 10;
 | 
| 
 | 
   134     bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);
 | 
| 
 | 
   135     float *tmpfa = (float*)malloc(2*sizeof(float));
 | 
| 
 | 
   136     tmpfa[0] = 0.333;
 | 
| 
 | 
   137     bcf_float_set_missing(tmpfa[1]);
 | 
| 
 | 
   138     bcf_update_info_float(hdr, rec, "AF", tmpfa, 2);
 | 
| 
 | 
   139     bcf_update_info_string(hdr, rec, "AA", "T");
 | 
| 
 | 
   140     bcf_update_info_flag(hdr, rec, "DB", NULL, 1);
 | 
| 
 | 
   141     tmpia[0] = bcf_gt_phased(2);
 | 
| 
 | 
   142     tmpia[1] = bcf_int32_vector_end;
 | 
| 
 | 
   143     tmpia[2] = bcf_gt_phased(1);
 | 
| 
 | 
   144     tmpia[3] = bcf_int32_vector_end;
 | 
| 
 | 
   145     tmpia[4] = bcf_gt_missing;
 | 
| 
 | 
   146     tmpia[5] = bcf_gt_missing;
 | 
| 
 | 
   147     bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2);
 | 
| 
 | 
   148     bcf_write1(fp, hdr, rec);
 | 
| 
 | 
   149 
 | 
| 
 | 
   150     free(tmpia);
 | 
| 
 | 
   151     free(tmpfa);
 | 
| 
 | 
   152 
 | 
| 
 | 
   153     // Clean
 | 
| 
 | 
   154     free(str.s);
 | 
| 
 | 
   155     bcf_destroy1(rec);
 | 
| 
 | 
   156     bcf_hdr_destroy(hdr);
 | 
| 
 | 
   157     int ret;
 | 
| 
 | 
   158     if ( (ret=hts_close(fp)) )
 | 
| 
 | 
   159     {
 | 
| 
 | 
   160         fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
 | 
| 
 | 
   161         exit(ret);
 | 
| 
 | 
   162     }
 | 
| 
 | 
   163 }
 | 
| 
 | 
   164 
 | 
| 
 | 
   165 void bcf_to_vcf(char *fname)
 | 
| 
 | 
   166 {
 | 
| 
 | 
   167     htsFile *fp    = hts_open(fname,"rb");
 | 
| 
 | 
   168     bcf_hdr_t *hdr = bcf_hdr_read(fp);
 | 
| 
 | 
   169     bcf1_t *rec    = bcf_init1();
 | 
| 
 | 
   170 
 | 
| 
 | 
   171     char *gz_fname = (char*) malloc(strlen(fname)+4);
 | 
| 
 | 
   172     snprintf(gz_fname,strlen(fname)+4,"%s.gz",fname);
 | 
| 
 | 
   173     htsFile *out   = hts_open(gz_fname,"wg");
 | 
| 
 | 
   174 
 | 
| 
 | 
   175     bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr);
 | 
| 
 | 
   176     bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused");
 | 
| 
 | 
   177     bcf_hdr_remove(hdr_out,BCF_HL_GEN,"unused");
 | 
| 
 | 
   178     bcf_hdr_remove(hdr_out,BCF_HL_FLT,"Flt");
 | 
| 
 | 
   179     bcf_hdr_remove(hdr_out,BCF_HL_INFO,"UI");
 | 
| 
 | 
   180     bcf_hdr_remove(hdr_out,BCF_HL_FMT,"UF");
 | 
| 
 | 
   181     bcf_hdr_remove(hdr_out,BCF_HL_CTG,"Unused");
 | 
| 
 | 
   182     bcf_hdr_write(out, hdr_out);
 | 
| 
 | 
   183 
 | 
| 
 | 
   184     while ( bcf_read1(fp, hdr, rec)>=0 )
 | 
| 
 | 
   185     {
 | 
| 
 | 
   186         bcf_write1(out, hdr_out, rec);
 | 
| 
 | 
   187 
 | 
| 
 | 
   188         // Test problems caused by bcf1_sync: the data block
 | 
| 
 | 
   189         // may be realloced, also the unpacked structures must
 | 
| 
 | 
   190         // get updated.
 | 
| 
 | 
   191         bcf_unpack(rec, BCF_UN_STR);
 | 
| 
 | 
   192         bcf_update_id(hdr, rec, 0);
 | 
| 
 | 
   193         bcf_update_format_int32(hdr, rec, "GQ", NULL, 0);
 | 
| 
 | 
   194 
 | 
| 
 | 
   195         bcf1_t *dup = bcf_dup(rec);     // force bcf1_sync call
 | 
| 
 | 
   196         bcf_write1(out, hdr_out, dup);
 | 
| 
 | 
   197         bcf_destroy1(dup);
 | 
| 
 | 
   198 
 | 
| 
 | 
   199         bcf_update_alleles_str(hdr_out, rec, "G,A");
 | 
| 
 | 
   200         int32_t tmpi = 99;
 | 
| 
 | 
   201         bcf_update_info_int32(hdr_out, rec, "DP", &tmpi, 1);
 | 
| 
 | 
   202         int32_t tmpia[] = {9,9,9};
 | 
| 
 | 
   203         bcf_update_format_int32(hdr_out, rec, "DP", tmpia, 3);
 | 
| 
 | 
   204 
 | 
| 
 | 
   205         bcf_write1(out, hdr_out, rec);
 | 
| 
 | 
   206     }
 | 
| 
 | 
   207 
 | 
| 
 | 
   208     bcf_destroy1(rec);
 | 
| 
 | 
   209     bcf_hdr_destroy(hdr);
 | 
| 
 | 
   210     bcf_hdr_destroy(hdr_out);
 | 
| 
 | 
   211     int ret;
 | 
| 
 | 
   212     if ( (ret=hts_close(fp)) )
 | 
| 
 | 
   213     {
 | 
| 
 | 
   214         fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
 | 
| 
 | 
   215         exit(ret);
 | 
| 
 | 
   216     }
 | 
| 
 | 
   217     if ( (ret=hts_close(out)) )
 | 
| 
 | 
   218     {
 | 
| 
 | 
   219         fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret);
 | 
| 
 | 
   220         exit(ret);
 | 
| 
 | 
   221     }
 | 
| 
 | 
   222 
 | 
| 
 | 
   223 
 | 
| 
 | 
   224     // read gzip, write stdout
 | 
| 
 | 
   225     htsFile *gz_in = hts_open(gz_fname, "r");
 | 
| 
 | 
   226     if ( !gz_in )
 | 
| 
 | 
   227     {
 | 
| 
 | 
   228         fprintf(stderr,"Could not read: %s\n", gz_fname);
 | 
| 
 | 
   229         exit(1);
 | 
| 
 | 
   230     }
 | 
| 
 | 
   231 
 | 
| 
 | 
   232     kstring_t line = {0,0,0};
 | 
| 
 | 
   233     while ( hts_getline(gz_in, KS_SEP_LINE, &line)>0 )
 | 
| 
 | 
   234     {
 | 
| 
 | 
   235         kputc('\n',&line);
 | 
| 
 | 
   236         fwrite(line.s,1,line.l,stdout);
 | 
| 
 | 
   237     }
 | 
| 
 | 
   238 
 | 
| 
 | 
   239     if ( (ret=hts_close(gz_in)) )
 | 
| 
 | 
   240     {
 | 
| 
 | 
   241         fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret);
 | 
| 
 | 
   242         exit(ret);
 | 
| 
 | 
   243     }
 | 
| 
 | 
   244     free(line.s);
 | 
| 
 | 
   245     free(gz_fname);
 | 
| 
 | 
   246 }
 | 
| 
 | 
   247 
 | 
| 
 | 
   248 void iterator(const char *fname)
 | 
| 
 | 
   249 {
 | 
| 
 | 
   250     htsFile *fp = hts_open(fname, "r");
 | 
| 
 | 
   251     bcf_hdr_t *hdr = bcf_hdr_read(fp);
 | 
| 
 | 
   252     hts_idx_t *idx;
 | 
| 
 | 
   253     hts_itr_t *iter;
 | 
| 
 | 
   254 
 | 
| 
 | 
   255     bcf_index_build(fname, 0);
 | 
| 
 | 
   256     idx = bcf_index_load(fname);
 | 
| 
 | 
   257 
 | 
| 
 | 
   258     iter = bcf_itr_queryi(idx, bcf_hdr_name2id(hdr, "20"), 1110600, 1110800);
 | 
| 
 | 
   259     bcf_itr_destroy(iter);
 | 
| 
 | 
   260 
 | 
| 
 | 
   261     iter = bcf_itr_querys(idx, hdr, "20:1110600-1110800");
 | 
| 
 | 
   262     bcf_itr_destroy(iter);
 | 
| 
 | 
   263 
 | 
| 
 | 
   264     hts_idx_destroy(idx);
 | 
| 
 | 
   265     bcf_hdr_destroy(hdr);
 | 
| 
 | 
   266     int ret;
 | 
| 
 | 
   267     if ( (ret=hts_close(fp)) )
 | 
| 
 | 
   268     {
 | 
| 
 | 
   269         fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
 | 
| 
 | 
   270         exit(ret);
 | 
| 
 | 
   271     }
 | 
| 
 | 
   272 }
 | 
| 
 | 
   273 
 | 
| 
 | 
   274 int main(int argc, char **argv)
 | 
| 
 | 
   275 {
 | 
| 
 | 
   276     char *fname = argc>1 ? argv[1] : "rmme.bcf";
 | 
| 
 | 
   277     write_bcf(fname);
 | 
| 
 | 
   278     bcf_to_vcf(fname);
 | 
| 
 | 
   279     iterator(fname);
 | 
| 
 | 
   280     return 0;
 | 
| 
 | 
   281 }
 | 
| 
 | 
   282 
 |