annotate bwa-0.6.2/bntseq.c @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
1 /* The MIT License
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
2
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
3 Copyright (c) 2008 Genome Research Ltd (GRL).
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
4
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
5 Permission is hereby granted, free of charge, to any person obtaining
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
6 a copy of this software and associated documentation files (the
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
7 "Software"), to deal in the Software without restriction, including
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
8 without limitation the rights to use, copy, modify, merge, publish,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
9 distribute, sublicense, and/or sell copies of the Software, and to
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
10 permit persons to whom the Software is furnished to do so, subject to
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
11 the following conditions:
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
12
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
13 The above copyright notice and this permission notice shall be
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
14 included in all copies or substantial portions of the Software.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
15
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
23 SOFTWARE.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
24 */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
25
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
27
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
28 #include <stdio.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
29 #include <stdlib.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
30 #include <string.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
31 #include <zlib.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
32 #include <unistd.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
33 #include "bntseq.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
34 #include "main.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
35 #include "utils.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
36
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
37 #include "kseq.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
38 KSEQ_INIT(gzFile, gzread)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
39
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
40 unsigned char nst_nt4_table[256] = {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
41 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
42 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
43 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
44 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
45 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
46 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
47 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
48 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
49 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
50 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
51 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
52 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
57 };
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
58
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
59 void bns_dump(const bntseq_t *bns, const char *prefix)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
60 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
61 char str[1024];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
62 FILE *fp;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
63 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
64 { // dump .ann
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
65 strcpy(str, prefix); strcat(str, ".ann");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
66 fp = xopen(str, "w");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
67 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
68 for (i = 0; i != bns->n_seqs; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
69 bntann1_t *p = bns->anns + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
70 fprintf(fp, "%d %s", p->gi, p->name);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
71 if (p->anno[0]) fprintf(fp, " %s\n", p->anno);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
72 else fprintf(fp, "\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
73 fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
74 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
75 fclose(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
76 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
77 { // dump .amb
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
78 strcpy(str, prefix); strcat(str, ".amb");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
79 fp = xopen(str, "w");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
80 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
81 for (i = 0; i != bns->n_holes; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
82 bntamb1_t *p = bns->ambs + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
83 fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
84 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
85 fclose(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
86 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
87 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
88
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
89 bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
90 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
91 char str[1024];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
92 FILE *fp;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
93 bntseq_t *bns;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
94 long long xx;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
95 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
96 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
97 { // read .ann
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
98 fp = xopen(ann_filename, "r");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
99 fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
100 bns->l_pac = xx;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
101 bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
102 for (i = 0; i < bns->n_seqs; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
103 bntann1_t *p = bns->anns + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
104 char *q = str;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
105 int c;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
106 // read gi and sequence name
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
107 fscanf(fp, "%u%s", &p->gi, str);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
108 p->name = strdup(str);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
109 // read fasta comments
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
110 while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
111 *q = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
112 if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
113 else p->anno = strdup("");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
114 // read the rest
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
115 fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
116 p->offset = xx;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
117 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
118 fclose(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
119 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
120 { // read .amb
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
121 int64_t l_pac;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
122 int32_t n_seqs;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
123 fp = xopen(amb_filename, "r");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
124 fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
125 l_pac = xx;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
126 xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
127 bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
128 for (i = 0; i < bns->n_holes; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
129 bntamb1_t *p = bns->ambs + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
130 fscanf(fp, "%lld%d%s", &xx, &p->len, str);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
131 p->offset = xx;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
132 p->amb = str[0];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
133 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
134 fclose(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
135 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
136 { // open .pac
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
137 bns->fp_pac = xopen(pac_filename, "rb");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
138 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
139 return bns;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
140 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
141
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
142 bntseq_t *bns_restore(const char *prefix)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
143 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
144 char ann_filename[1024], amb_filename[1024], pac_filename[1024];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
145 strcat(strcpy(ann_filename, prefix), ".ann");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
146 strcat(strcpy(amb_filename, prefix), ".amb");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
147 strcat(strcpy(pac_filename, prefix), ".pac");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
148 return bns_restore_core(ann_filename, amb_filename, pac_filename);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
149 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
150
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
151 void bns_destroy(bntseq_t *bns)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
152 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
153 if (bns == 0) return;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
154 else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
155 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
156 if (bns->fp_pac) fclose(bns->fp_pac);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
157 free(bns->ambs);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
158 for (i = 0; i < bns->n_seqs; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
159 free(bns->anns[i].name);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
160 free(bns->anns[i].anno);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
161 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
162 free(bns->anns);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
163 free(bns);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
164 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
165 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
166
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
167 #define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1))
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
168 #define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
169
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
170 static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
171 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
172 bntann1_t *p;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
173 int i, lasts;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
174 if (bns->n_seqs == *m_seqs) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
175 *m_seqs <<= 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
176 bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
177 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
178 p = bns->anns + bns->n_seqs;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
179 p->name = strdup((char*)seq->name.s);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
180 p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
181 p->gi = 0; p->len = seq->seq.l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
182 p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
183 p->n_ambs = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
184 for (i = lasts = 0; i < seq->seq.l; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
185 int c = nst_nt4_table[(int)seq->seq.s[i]];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
186 if (c >= 4) { // N
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
187 if (lasts == seq->seq.s[i]) { // contiguous N
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
188 ++(*q)->len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
189 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
190 if (bns->n_holes == *m_holes) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
191 (*m_holes) <<= 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
192 bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
193 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
194 *q = bns->ambs + bns->n_holes;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
195 (*q)->len = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
196 (*q)->offset = p->offset + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
197 (*q)->amb = seq->seq.s[i];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
198 ++p->n_ambs;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
199 ++bns->n_holes;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
200 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
201 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
202 lasts = seq->seq.s[i];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
203 { // fill buffer
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
204 if (c >= 4) c = lrand48()&3;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
205 if (bns->l_pac == *m_pac) { // double the pac size
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
206 *m_pac <<= 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
207 pac = realloc(pac, *m_pac/4);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
208 memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
209 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
210 _set_pac(pac, bns->l_pac, c);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
211 ++bns->l_pac;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
212 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
213 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
214 ++bns->n_seqs;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
215 return pac;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
216 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
217
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
218 int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
219 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
220 extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
221 kseq_t *seq;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
222 char name[1024];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
223 bntseq_t *bns;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
224 uint8_t *pac = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
225 int32_t m_seqs, m_holes;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
226 int64_t ret = -1, m_pac, l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
227 bntamb1_t *q;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
228 FILE *fp;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
229
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
230 // initialization
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
231 seq = kseq_init(fp_fa);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
232 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
233 bns->seed = 11; // fixed seed for random generator
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
234 srand48(bns->seed);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
235 m_seqs = m_holes = 8; m_pac = 0x10000;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
236 bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
237 bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
238 pac = calloc(m_pac/4, 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
239 q = bns->ambs;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
240 strcpy(name, prefix); strcat(name, ".pac");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
241 fp = xopen(name, "wb");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
242 // read sequences
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
243 while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
244 if (!for_only) { // add the reverse complemented sequence
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
245 m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
246 pac = realloc(pac, m_pac/4);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
247 memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
248 for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
249 _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
250 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
251 ret = bns->l_pac;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
252 { // finalize .pac file
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
253 ubyte_t ct;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
254 fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
255 // the following codes make the pac file size always (l_pac/4+1+1)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
256 if (bns->l_pac % 4 == 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
257 ct = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
258 fwrite(&ct, 1, 1, fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
259 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
260 ct = bns->l_pac % 4;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
261 fwrite(&ct, 1, 1, fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
262 // close .pac file
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
263 fclose(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
264 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
265 bns_dump(bns, prefix);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
266 bns_destroy(bns);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
267 kseq_destroy(seq);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
268 free(pac);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
269 return ret;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
270 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
271
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
272 int bwa_fa2pac(int argc, char *argv[])
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
273 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
274 int c, for_only = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
275 gzFile fp;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
276 while ((c = getopt(argc, argv, "f")) >= 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
277 switch (c) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
278 case 'f': for_only = 1; break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
279 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
280 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
281 if (argc == optind) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
282 fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
283 return 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
284 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
285 fp = xzopen(argv[optind], "r");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
286 bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
287 gzclose(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
288 return 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
289 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
290
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
291 int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
292 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
293 int left, mid, right, nn;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
294 if (ref_id) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
295 left = 0; mid = 0; right = bns->n_seqs;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
296 while (left < right) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
297 mid = (left + right) >> 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
298 if (pos_f >= bns->anns[mid].offset) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
299 if (mid == bns->n_seqs - 1) break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
300 if (pos_f < bns->anns[mid+1].offset) break; // bracketed
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
301 left = mid + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
302 } else right = mid;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
303 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
304 *ref_id = mid;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
305 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
306 left = 0; right = bns->n_holes; nn = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
307 while (left < right) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
308 mid = (left + right) >> 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
309 if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
310 else if (pos_f + len <= bns->ambs[mid].offset) right = mid;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
311 else { // overlap
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
312 if (pos_f >= bns->ambs[mid].offset) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
313 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
314 bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
315 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
316 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
317 bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
318 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
319 break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
320 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
321 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
322 return nn;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
323 }