Mercurial > repos > ashvark > qiime_1_8_0
comparison bwa-0.6.2/bntseq.c @ 2:a294fbfcb1db draft default tip
Uploaded BWA
author | ashvark |
---|---|
date | Fri, 18 Jul 2014 07:55:59 -0400 |
parents | dd1186b11b3b |
children |
comparison
equal
deleted
inserted
replaced
1:a9636dc1e99a | 2:a294fbfcb1db |
---|---|
1 /* The MIT License | |
2 | |
3 Copyright (c) 2008 Genome Research Ltd (GRL). | |
4 | |
5 Permission is hereby granted, free of charge, to any person obtaining | |
6 a copy of this software and associated documentation files (the | |
7 "Software"), to deal in the Software without restriction, including | |
8 without limitation the rights to use, copy, modify, merge, publish, | |
9 distribute, sublicense, and/or sell copies of the Software, and to | |
10 permit persons to whom the Software is furnished to do so, subject to | |
11 the following conditions: | |
12 | |
13 The above copyright notice and this permission notice shall be | |
14 included in all copies or substantial portions of the Software. | |
15 | |
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
23 SOFTWARE. | |
24 */ | |
25 | |
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */ | |
27 | |
28 #include <stdio.h> | |
29 #include <stdlib.h> | |
30 #include <string.h> | |
31 #include <zlib.h> | |
32 #include <unistd.h> | |
33 #include "bntseq.h" | |
34 #include "main.h" | |
35 #include "utils.h" | |
36 | |
37 #include "kseq.h" | |
38 KSEQ_INIT(gzFile, gzread) | |
39 | |
40 unsigned char nst_nt4_table[256] = { | |
41 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
42 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
43 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, | |
44 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
45 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, | |
46 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
47 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, | |
48 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
49 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
50 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
51 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
52 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
56 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 | |
57 }; | |
58 | |
59 void bns_dump(const bntseq_t *bns, const char *prefix) | |
60 { | |
61 char str[1024]; | |
62 FILE *fp; | |
63 int i; | |
64 { // dump .ann | |
65 strcpy(str, prefix); strcat(str, ".ann"); | |
66 fp = xopen(str, "w"); | |
67 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); | |
68 for (i = 0; i != bns->n_seqs; ++i) { | |
69 bntann1_t *p = bns->anns + i; | |
70 fprintf(fp, "%d %s", p->gi, p->name); | |
71 if (p->anno[0]) fprintf(fp, " %s\n", p->anno); | |
72 else fprintf(fp, "\n"); | |
73 fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); | |
74 } | |
75 fclose(fp); | |
76 } | |
77 { // dump .amb | |
78 strcpy(str, prefix); strcat(str, ".amb"); | |
79 fp = xopen(str, "w"); | |
80 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); | |
81 for (i = 0; i != bns->n_holes; ++i) { | |
82 bntamb1_t *p = bns->ambs + i; | |
83 fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); | |
84 } | |
85 fclose(fp); | |
86 } | |
87 } | |
88 | |
89 bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename) | |
90 { | |
91 char str[1024]; | |
92 FILE *fp; | |
93 bntseq_t *bns; | |
94 long long xx; | |
95 int i; | |
96 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); | |
97 { // read .ann | |
98 fp = xopen(ann_filename, "r"); | |
99 fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); | |
100 bns->l_pac = xx; | |
101 bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); | |
102 for (i = 0; i < bns->n_seqs; ++i) { | |
103 bntann1_t *p = bns->anns + i; | |
104 char *q = str; | |
105 int c; | |
106 // read gi and sequence name | |
107 fscanf(fp, "%u%s", &p->gi, str); | |
108 p->name = strdup(str); | |
109 // read fasta comments | |
110 while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; | |
111 *q = 0; | |
112 if (q - str > 1) p->anno = strdup(str + 1); // skip leading space | |
113 else p->anno = strdup(""); | |
114 // read the rest | |
115 fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); | |
116 p->offset = xx; | |
117 } | |
118 fclose(fp); | |
119 } | |
120 { // read .amb | |
121 int64_t l_pac; | |
122 int32_t n_seqs; | |
123 fp = xopen(amb_filename, "r"); | |
124 fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); | |
125 l_pac = xx; | |
126 xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); | |
127 bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)); | |
128 for (i = 0; i < bns->n_holes; ++i) { | |
129 bntamb1_t *p = bns->ambs + i; | |
130 fscanf(fp, "%lld%d%s", &xx, &p->len, str); | |
131 p->offset = xx; | |
132 p->amb = str[0]; | |
133 } | |
134 fclose(fp); | |
135 } | |
136 { // open .pac | |
137 bns->fp_pac = xopen(pac_filename, "rb"); | |
138 } | |
139 return bns; | |
140 } | |
141 | |
142 bntseq_t *bns_restore(const char *prefix) | |
143 { | |
144 char ann_filename[1024], amb_filename[1024], pac_filename[1024]; | |
145 strcat(strcpy(ann_filename, prefix), ".ann"); | |
146 strcat(strcpy(amb_filename, prefix), ".amb"); | |
147 strcat(strcpy(pac_filename, prefix), ".pac"); | |
148 return bns_restore_core(ann_filename, amb_filename, pac_filename); | |
149 } | |
150 | |
151 void bns_destroy(bntseq_t *bns) | |
152 { | |
153 if (bns == 0) return; | |
154 else { | |
155 int i; | |
156 if (bns->fp_pac) fclose(bns->fp_pac); | |
157 free(bns->ambs); | |
158 for (i = 0; i < bns->n_seqs; ++i) { | |
159 free(bns->anns[i].name); | |
160 free(bns->anns[i].anno); | |
161 } | |
162 free(bns->anns); | |
163 free(bns); | |
164 } | |
165 } | |
166 | |
167 #define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1)) | |
168 #define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3) | |
169 | |
170 static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q) | |
171 { | |
172 bntann1_t *p; | |
173 int i, lasts; | |
174 if (bns->n_seqs == *m_seqs) { | |
175 *m_seqs <<= 1; | |
176 bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t)); | |
177 } | |
178 p = bns->anns + bns->n_seqs; | |
179 p->name = strdup((char*)seq->name.s); | |
180 p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); | |
181 p->gi = 0; p->len = seq->seq.l; | |
182 p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; | |
183 p->n_ambs = 0; | |
184 for (i = lasts = 0; i < seq->seq.l; ++i) { | |
185 int c = nst_nt4_table[(int)seq->seq.s[i]]; | |
186 if (c >= 4) { // N | |
187 if (lasts == seq->seq.s[i]) { // contiguous N | |
188 ++(*q)->len; | |
189 } else { | |
190 if (bns->n_holes == *m_holes) { | |
191 (*m_holes) <<= 1; | |
192 bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)); | |
193 } | |
194 *q = bns->ambs + bns->n_holes; | |
195 (*q)->len = 1; | |
196 (*q)->offset = p->offset + i; | |
197 (*q)->amb = seq->seq.s[i]; | |
198 ++p->n_ambs; | |
199 ++bns->n_holes; | |
200 } | |
201 } | |
202 lasts = seq->seq.s[i]; | |
203 { // fill buffer | |
204 if (c >= 4) c = lrand48()&3; | |
205 if (bns->l_pac == *m_pac) { // double the pac size | |
206 *m_pac <<= 1; | |
207 pac = realloc(pac, *m_pac/4); | |
208 memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4); | |
209 } | |
210 _set_pac(pac, bns->l_pac, c); | |
211 ++bns->l_pac; | |
212 } | |
213 } | |
214 ++bns->n_seqs; | |
215 return pac; | |
216 } | |
217 | |
218 int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) | |
219 { | |
220 extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c | |
221 kseq_t *seq; | |
222 char name[1024]; | |
223 bntseq_t *bns; | |
224 uint8_t *pac = 0; | |
225 int32_t m_seqs, m_holes; | |
226 int64_t ret = -1, m_pac, l; | |
227 bntamb1_t *q; | |
228 FILE *fp; | |
229 | |
230 // initialization | |
231 seq = kseq_init(fp_fa); | |
232 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); | |
233 bns->seed = 11; // fixed seed for random generator | |
234 srand48(bns->seed); | |
235 m_seqs = m_holes = 8; m_pac = 0x10000; | |
236 bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); | |
237 bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); | |
238 pac = calloc(m_pac/4, 1); | |
239 q = bns->ambs; | |
240 strcpy(name, prefix); strcat(name, ".pac"); | |
241 fp = xopen(name, "wb"); | |
242 // read sequences | |
243 while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); | |
244 if (!for_only) { // add the reverse complemented sequence | |
245 m_pac = (bns->l_pac * 2 + 3) / 4 * 4; | |
246 pac = realloc(pac, m_pac/4); | |
247 memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4); | |
248 for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac) | |
249 _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l)); | |
250 } | |
251 ret = bns->l_pac; | |
252 { // finalize .pac file | |
253 ubyte_t ct; | |
254 fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp); | |
255 // the following codes make the pac file size always (l_pac/4+1+1) | |
256 if (bns->l_pac % 4 == 0) { | |
257 ct = 0; | |
258 fwrite(&ct, 1, 1, fp); | |
259 } | |
260 ct = bns->l_pac % 4; | |
261 fwrite(&ct, 1, 1, fp); | |
262 // close .pac file | |
263 fclose(fp); | |
264 } | |
265 bns_dump(bns, prefix); | |
266 bns_destroy(bns); | |
267 kseq_destroy(seq); | |
268 free(pac); | |
269 return ret; | |
270 } | |
271 | |
272 int bwa_fa2pac(int argc, char *argv[]) | |
273 { | |
274 int c, for_only = 0; | |
275 gzFile fp; | |
276 while ((c = getopt(argc, argv, "f")) >= 0) { | |
277 switch (c) { | |
278 case 'f': for_only = 1; break; | |
279 } | |
280 } | |
281 if (argc == optind) { | |
282 fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n"); | |
283 return 1; | |
284 } | |
285 fp = xzopen(argv[optind], "r"); | |
286 bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only); | |
287 gzclose(fp); | |
288 return 0; | |
289 } | |
290 | |
291 int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) | |
292 { | |
293 int left, mid, right, nn; | |
294 if (ref_id) { | |
295 left = 0; mid = 0; right = bns->n_seqs; | |
296 while (left < right) { | |
297 mid = (left + right) >> 1; | |
298 if (pos_f >= bns->anns[mid].offset) { | |
299 if (mid == bns->n_seqs - 1) break; | |
300 if (pos_f < bns->anns[mid+1].offset) break; // bracketed | |
301 left = mid + 1; | |
302 } else right = mid; | |
303 } | |
304 *ref_id = mid; | |
305 } | |
306 left = 0; right = bns->n_holes; nn = 0; | |
307 while (left < right) { | |
308 mid = (left + right) >> 1; | |
309 if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; | |
310 else if (pos_f + len <= bns->ambs[mid].offset) right = mid; | |
311 else { // overlap | |
312 if (pos_f >= bns->ambs[mid].offset) { | |
313 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? | |
314 bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len; | |
315 } else { | |
316 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? | |
317 bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f); | |
318 } | |
319 break; | |
320 } | |
321 } | |
322 return nn; | |
323 } |