comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bntseq.c @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <zlib.h>
32 #include "bntseq.h"
33 #include "main.h"
34 #include "utils.h"
35
36 #include "kseq.h"
37 KSEQ_INIT(gzFile, gzread)
38
39 unsigned char nst_nt4_table[256] = {
40 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
41 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
42 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
43 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
44 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
45 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
46 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
47 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
49 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
50 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
51 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
52 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
56 };
57
58 void bns_dump(const bntseq_t *bns, const char *prefix)
59 {
60 char str[1024];
61 FILE *fp;
62 int i;
63 { // dump .ann
64 strcpy(str, prefix); strcat(str, ".ann");
65 fp = xopen(str, "w");
66 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
67 for (i = 0; i != bns->n_seqs; ++i) {
68 bntann1_t *p = bns->anns + i;
69 fprintf(fp, "%d %s", p->gi, p->name);
70 if (p->anno[0]) fprintf(fp, " %s\n", p->anno);
71 else fprintf(fp, "\n");
72 fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
73 }
74 fclose(fp);
75 }
76 { // dump .amb
77 strcpy(str, prefix); strcat(str, ".amb");
78 fp = xopen(str, "w");
79 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
80 for (i = 0; i != bns->n_holes; ++i) {
81 bntamb1_t *p = bns->ambs + i;
82 fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
83 }
84 fclose(fp);
85 }
86 }
87
88 bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
89 {
90 char str[1024];
91 FILE *fp;
92 bntseq_t *bns;
93 long long xx;
94 int i;
95 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
96 { // read .ann
97 fp = xopen(ann_filename, "r");
98 fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
99 bns->l_pac = xx;
100 bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
101 for (i = 0; i < bns->n_seqs; ++i) {
102 bntann1_t *p = bns->anns + i;
103 char *q = str;
104 int c;
105 // read gi and sequence name
106 fscanf(fp, "%u%s", &p->gi, str);
107 p->name = strdup(str);
108 // read fasta comments
109 while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
110 *q = 0;
111 if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
112 else p->anno = strdup("");
113 // read the rest
114 fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
115 p->offset = xx;
116 }
117 fclose(fp);
118 }
119 { // read .amb
120 int64_t l_pac;
121 int32_t n_seqs;
122 fp = xopen(amb_filename, "r");
123 fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
124 l_pac = xx;
125 xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
126 bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
127 for (i = 0; i < bns->n_holes; ++i) {
128 bntamb1_t *p = bns->ambs + i;
129 fscanf(fp, "%lld%d%s", &xx, &p->len, str);
130 p->offset = xx;
131 p->amb = str[0];
132 }
133 fclose(fp);
134 }
135 { // open .pac
136 bns->fp_pac = xopen(pac_filename, "rb");
137 }
138 return bns;
139 }
140
141 bntseq_t *bns_restore(const char *prefix)
142 {
143 char ann_filename[1024], amb_filename[1024], pac_filename[1024];
144 strcat(strcpy(ann_filename, prefix), ".ann");
145 strcat(strcpy(amb_filename, prefix), ".amb");
146 strcat(strcpy(pac_filename, prefix), ".pac");
147 return bns_restore_core(ann_filename, amb_filename, pac_filename);
148 }
149
150 void bns_destroy(bntseq_t *bns)
151 {
152 if (bns == 0) return;
153 else {
154 int i;
155 if (bns->fp_pac) fclose(bns->fp_pac);
156 free(bns->ambs);
157 for (i = 0; i < bns->n_seqs; ++i) {
158 free(bns->anns[i].name);
159 free(bns->anns[i].anno);
160 }
161 free(bns->anns);
162 free(bns);
163 }
164 }
165
166 void bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
167 {
168 kseq_t *seq;
169 char name[1024];
170 bntseq_t *bns;
171 bntamb1_t *q;
172 int l_buf;
173 unsigned char buf[0x10000];
174 int32_t m_seqs, m_holes, l, i;
175 FILE *fp;
176
177 // initialization
178 seq = kseq_init(fp_fa);
179 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
180 bns->seed = 11; // fixed seed for random generator
181 srand48(bns->seed);
182 m_seqs = m_holes = 8;
183 bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
184 bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
185 q = bns->ambs;
186 l_buf = 0;
187 strcpy(name, prefix); strcat(name, ".pac");
188 fp = xopen(name, "wb");
189 memset(buf, 0, 0x10000);
190 // read sequences
191 while ((l = kseq_read(seq)) >= 0) {
192 bntann1_t *p;
193 int lasts;
194 if (bns->n_seqs == m_seqs) {
195 m_seqs <<= 1;
196 bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t));
197 }
198 p = bns->anns + bns->n_seqs;
199 p->name = strdup((char*)seq->name.s);
200 p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
201 p->gi = 0; p->len = l;
202 p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
203 p->n_ambs = 0;
204 for (i = 0, lasts = 0; i < l; ++i) {
205 int c = nst_nt4_table[(int)seq->seq.s[i]];
206 if (c >= 4) { // N
207 if (lasts == seq->seq.s[i]) { // contiguous N
208 ++q->len;
209 } else {
210 if (bns->n_holes == m_holes) {
211 m_holes <<= 1;
212 bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t));
213 }
214 q = bns->ambs + bns->n_holes;
215 q->len = 1;
216 q->offset = p->offset + i;
217 q->amb = seq->seq.s[i];
218 ++p->n_ambs;
219 ++bns->n_holes;
220 }
221 }
222 lasts = seq->seq.s[i];
223 { // fill buffer
224 if (c >= 4) c = lrand48()&0x3;
225 if (l_buf == 0x40000) {
226 fwrite(buf, 1, 0x10000, fp);
227 memset(buf, 0, 0x10000);
228 l_buf = 0;
229 }
230 buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1);
231 ++l_buf;
232 }
233 }
234 ++bns->n_seqs;
235 bns->l_pac += seq->seq.l;
236 }
237 xassert(bns->l_pac, "zero length sequence.");
238 { // finalize .pac file
239 ubyte_t ct;
240 fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp);
241 // the following codes make the pac file size always (l_pac/4+1+1)
242 if (bns->l_pac % 4 == 0) {
243 ct = 0;
244 fwrite(&ct, 1, 1, fp);
245 }
246 ct = bns->l_pac % 4;
247 fwrite(&ct, 1, 1, fp);
248 // close .pac file
249 fclose(fp);
250 }
251 bns_dump(bns, prefix);
252 bns_destroy(bns);
253 kseq_destroy(seq);
254 }
255
256 int bwa_fa2pac(int argc, char *argv[])
257 {
258 gzFile fp;
259 if (argc < 2) {
260 fprintf(stderr, "Usage: bwa fa2pac <in.fasta> [<out.prefix>]\n");
261 return 1;
262 }
263 fp = xzopen(argv[1], "r");
264 bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]);
265 gzclose(fp);
266 return 0;
267 }
268
269 int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq)
270 {
271 int left, mid, right, nn;
272 if (pac_coor >= bns->l_pac)
273 err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac);
274 // binary search for the sequence ID. Note that this is a bit different from the following one...
275 left = 0; mid = 0; right = bns->n_seqs;
276 while (left < right) {
277 mid = (left + right) >> 1;
278 if (pac_coor >= bns->anns[mid].offset) {
279 if (mid == bns->n_seqs - 1) break;
280 if (pac_coor < bns->anns[mid+1].offset) break;
281 left = mid + 1;
282 } else right = mid;
283 }
284 *real_seq = mid;
285 // binary search for holes
286 left = 0; right = bns->n_holes; nn = 0;
287 while (left < right) {
288 int64_t mid = (left + right) >> 1;
289 if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
290 else if (pac_coor + len <= bns->ambs[mid].offset) right = mid;
291 else { // overlap
292 if (pac_coor >= bns->ambs[mid].offset) {
293 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
294 bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len;
295 } else {
296 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
297 bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor);
298 }
299 break;
300 }
301 }
302 return nn;
303 }