comparison bwa-0.6.2/bntseq.c @ 0:dd1186b11b3b draft

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dd1186b11b3b
1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <zlib.h>
32 #include <unistd.h>
33 #include "bntseq.h"
34 #include "main.h"
35 #include "utils.h"
36
37 #include "kseq.h"
38 KSEQ_INIT(gzFile, gzread)
39
40 unsigned char nst_nt4_table[256] = {
41 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
42 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
43 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
44 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
45 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
46 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
47 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
48 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
49 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
50 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
51 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
52 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
57 };
58
59 void bns_dump(const bntseq_t *bns, const char *prefix)
60 {
61 char str[1024];
62 FILE *fp;
63 int i;
64 { // dump .ann
65 strcpy(str, prefix); strcat(str, ".ann");
66 fp = xopen(str, "w");
67 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
68 for (i = 0; i != bns->n_seqs; ++i) {
69 bntann1_t *p = bns->anns + i;
70 fprintf(fp, "%d %s", p->gi, p->name);
71 if (p->anno[0]) fprintf(fp, " %s\n", p->anno);
72 else fprintf(fp, "\n");
73 fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
74 }
75 fclose(fp);
76 }
77 { // dump .amb
78 strcpy(str, prefix); strcat(str, ".amb");
79 fp = xopen(str, "w");
80 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
81 for (i = 0; i != bns->n_holes; ++i) {
82 bntamb1_t *p = bns->ambs + i;
83 fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
84 }
85 fclose(fp);
86 }
87 }
88
89 bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
90 {
91 char str[1024];
92 FILE *fp;
93 bntseq_t *bns;
94 long long xx;
95 int i;
96 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
97 { // read .ann
98 fp = xopen(ann_filename, "r");
99 fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
100 bns->l_pac = xx;
101 bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
102 for (i = 0; i < bns->n_seqs; ++i) {
103 bntann1_t *p = bns->anns + i;
104 char *q = str;
105 int c;
106 // read gi and sequence name
107 fscanf(fp, "%u%s", &p->gi, str);
108 p->name = strdup(str);
109 // read fasta comments
110 while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
111 *q = 0;
112 if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
113 else p->anno = strdup("");
114 // read the rest
115 fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
116 p->offset = xx;
117 }
118 fclose(fp);
119 }
120 { // read .amb
121 int64_t l_pac;
122 int32_t n_seqs;
123 fp = xopen(amb_filename, "r");
124 fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
125 l_pac = xx;
126 xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
127 bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
128 for (i = 0; i < bns->n_holes; ++i) {
129 bntamb1_t *p = bns->ambs + i;
130 fscanf(fp, "%lld%d%s", &xx, &p->len, str);
131 p->offset = xx;
132 p->amb = str[0];
133 }
134 fclose(fp);
135 }
136 { // open .pac
137 bns->fp_pac = xopen(pac_filename, "rb");
138 }
139 return bns;
140 }
141
142 bntseq_t *bns_restore(const char *prefix)
143 {
144 char ann_filename[1024], amb_filename[1024], pac_filename[1024];
145 strcat(strcpy(ann_filename, prefix), ".ann");
146 strcat(strcpy(amb_filename, prefix), ".amb");
147 strcat(strcpy(pac_filename, prefix), ".pac");
148 return bns_restore_core(ann_filename, amb_filename, pac_filename);
149 }
150
151 void bns_destroy(bntseq_t *bns)
152 {
153 if (bns == 0) return;
154 else {
155 int i;
156 if (bns->fp_pac) fclose(bns->fp_pac);
157 free(bns->ambs);
158 for (i = 0; i < bns->n_seqs; ++i) {
159 free(bns->anns[i].name);
160 free(bns->anns[i].anno);
161 }
162 free(bns->anns);
163 free(bns);
164 }
165 }
166
167 #define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1))
168 #define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3)
169
170 static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
171 {
172 bntann1_t *p;
173 int i, lasts;
174 if (bns->n_seqs == *m_seqs) {
175 *m_seqs <<= 1;
176 bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
177 }
178 p = bns->anns + bns->n_seqs;
179 p->name = strdup((char*)seq->name.s);
180 p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
181 p->gi = 0; p->len = seq->seq.l;
182 p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
183 p->n_ambs = 0;
184 for (i = lasts = 0; i < seq->seq.l; ++i) {
185 int c = nst_nt4_table[(int)seq->seq.s[i]];
186 if (c >= 4) { // N
187 if (lasts == seq->seq.s[i]) { // contiguous N
188 ++(*q)->len;
189 } else {
190 if (bns->n_holes == *m_holes) {
191 (*m_holes) <<= 1;
192 bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
193 }
194 *q = bns->ambs + bns->n_holes;
195 (*q)->len = 1;
196 (*q)->offset = p->offset + i;
197 (*q)->amb = seq->seq.s[i];
198 ++p->n_ambs;
199 ++bns->n_holes;
200 }
201 }
202 lasts = seq->seq.s[i];
203 { // fill buffer
204 if (c >= 4) c = lrand48()&3;
205 if (bns->l_pac == *m_pac) { // double the pac size
206 *m_pac <<= 1;
207 pac = realloc(pac, *m_pac/4);
208 memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4);
209 }
210 _set_pac(pac, bns->l_pac, c);
211 ++bns->l_pac;
212 }
213 }
214 ++bns->n_seqs;
215 return pac;
216 }
217
218 int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
219 {
220 extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
221 kseq_t *seq;
222 char name[1024];
223 bntseq_t *bns;
224 uint8_t *pac = 0;
225 int32_t m_seqs, m_holes;
226 int64_t ret = -1, m_pac, l;
227 bntamb1_t *q;
228 FILE *fp;
229
230 // initialization
231 seq = kseq_init(fp_fa);
232 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
233 bns->seed = 11; // fixed seed for random generator
234 srand48(bns->seed);
235 m_seqs = m_holes = 8; m_pac = 0x10000;
236 bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
237 bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
238 pac = calloc(m_pac/4, 1);
239 q = bns->ambs;
240 strcpy(name, prefix); strcat(name, ".pac");
241 fp = xopen(name, "wb");
242 // read sequences
243 while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
244 if (!for_only) { // add the reverse complemented sequence
245 m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
246 pac = realloc(pac, m_pac/4);
247 memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
248 for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
249 _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
250 }
251 ret = bns->l_pac;
252 { // finalize .pac file
253 ubyte_t ct;
254 fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
255 // the following codes make the pac file size always (l_pac/4+1+1)
256 if (bns->l_pac % 4 == 0) {
257 ct = 0;
258 fwrite(&ct, 1, 1, fp);
259 }
260 ct = bns->l_pac % 4;
261 fwrite(&ct, 1, 1, fp);
262 // close .pac file
263 fclose(fp);
264 }
265 bns_dump(bns, prefix);
266 bns_destroy(bns);
267 kseq_destroy(seq);
268 free(pac);
269 return ret;
270 }
271
272 int bwa_fa2pac(int argc, char *argv[])
273 {
274 int c, for_only = 0;
275 gzFile fp;
276 while ((c = getopt(argc, argv, "f")) >= 0) {
277 switch (c) {
278 case 'f': for_only = 1; break;
279 }
280 }
281 if (argc == optind) {
282 fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
283 return 1;
284 }
285 fp = xzopen(argv[optind], "r");
286 bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only);
287 gzclose(fp);
288 return 0;
289 }
290
291 int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
292 {
293 int left, mid, right, nn;
294 if (ref_id) {
295 left = 0; mid = 0; right = bns->n_seqs;
296 while (left < right) {
297 mid = (left + right) >> 1;
298 if (pos_f >= bns->anns[mid].offset) {
299 if (mid == bns->n_seqs - 1) break;
300 if (pos_f < bns->anns[mid+1].offset) break; // bracketed
301 left = mid + 1;
302 } else right = mid;
303 }
304 *ref_id = mid;
305 }
306 left = 0; right = bns->n_holes; nn = 0;
307 while (left < right) {
308 mid = (left + right) >> 1;
309 if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
310 else if (pos_f + len <= bns->ambs[mid].offset) right = mid;
311 else { // overlap
312 if (pos_f >= bns->ambs[mid].offset) {
313 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
314 bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len;
315 } else {
316 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
317 bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f);
318 }
319 break;
320 }
321 }
322 return nn;
323 }