Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bntseq.c @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 /* The MIT License | |
2 | |
3 Copyright (c) 2008 Genome Research Ltd (GRL). | |
4 | |
5 Permission is hereby granted, free of charge, to any person obtaining | |
6 a copy of this software and associated documentation files (the | |
7 "Software"), to deal in the Software without restriction, including | |
8 without limitation the rights to use, copy, modify, merge, publish, | |
9 distribute, sublicense, and/or sell copies of the Software, and to | |
10 permit persons to whom the Software is furnished to do so, subject to | |
11 the following conditions: | |
12 | |
13 The above copyright notice and this permission notice shall be | |
14 included in all copies or substantial portions of the Software. | |
15 | |
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
23 SOFTWARE. | |
24 */ | |
25 | |
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */ | |
27 | |
28 #include <stdio.h> | |
29 #include <stdlib.h> | |
30 #include <string.h> | |
31 #include <zlib.h> | |
32 #include "bntseq.h" | |
33 #include "main.h" | |
34 #include "utils.h" | |
35 | |
36 #include "kseq.h" | |
37 KSEQ_INIT(gzFile, gzread) | |
38 | |
39 unsigned char nst_nt4_table[256] = { | |
40 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
41 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
42 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, | |
43 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
44 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, | |
45 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
46 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, | |
47 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
49 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
50 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
51 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
52 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
53 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
54 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
55 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 | |
56 }; | |
57 | |
58 void bns_dump(const bntseq_t *bns, const char *prefix) | |
59 { | |
60 char str[1024]; | |
61 FILE *fp; | |
62 int i; | |
63 { // dump .ann | |
64 strcpy(str, prefix); strcat(str, ".ann"); | |
65 fp = xopen(str, "w"); | |
66 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); | |
67 for (i = 0; i != bns->n_seqs; ++i) { | |
68 bntann1_t *p = bns->anns + i; | |
69 fprintf(fp, "%d %s", p->gi, p->name); | |
70 if (p->anno[0]) fprintf(fp, " %s\n", p->anno); | |
71 else fprintf(fp, "\n"); | |
72 fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); | |
73 } | |
74 fclose(fp); | |
75 } | |
76 { // dump .amb | |
77 strcpy(str, prefix); strcat(str, ".amb"); | |
78 fp = xopen(str, "w"); | |
79 fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); | |
80 for (i = 0; i != bns->n_holes; ++i) { | |
81 bntamb1_t *p = bns->ambs + i; | |
82 fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); | |
83 } | |
84 fclose(fp); | |
85 } | |
86 } | |
87 | |
88 bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename) | |
89 { | |
90 char str[1024]; | |
91 FILE *fp; | |
92 bntseq_t *bns; | |
93 long long xx; | |
94 int i; | |
95 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); | |
96 { // read .ann | |
97 fp = xopen(ann_filename, "r"); | |
98 fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); | |
99 bns->l_pac = xx; | |
100 bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); | |
101 for (i = 0; i < bns->n_seqs; ++i) { | |
102 bntann1_t *p = bns->anns + i; | |
103 char *q = str; | |
104 int c; | |
105 // read gi and sequence name | |
106 fscanf(fp, "%u%s", &p->gi, str); | |
107 p->name = strdup(str); | |
108 // read fasta comments | |
109 while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; | |
110 *q = 0; | |
111 if (q - str > 1) p->anno = strdup(str + 1); // skip leading space | |
112 else p->anno = strdup(""); | |
113 // read the rest | |
114 fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); | |
115 p->offset = xx; | |
116 } | |
117 fclose(fp); | |
118 } | |
119 { // read .amb | |
120 int64_t l_pac; | |
121 int32_t n_seqs; | |
122 fp = xopen(amb_filename, "r"); | |
123 fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); | |
124 l_pac = xx; | |
125 xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); | |
126 bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)); | |
127 for (i = 0; i < bns->n_holes; ++i) { | |
128 bntamb1_t *p = bns->ambs + i; | |
129 fscanf(fp, "%lld%d%s", &xx, &p->len, str); | |
130 p->offset = xx; | |
131 p->amb = str[0]; | |
132 } | |
133 fclose(fp); | |
134 } | |
135 { // open .pac | |
136 bns->fp_pac = xopen(pac_filename, "rb"); | |
137 } | |
138 return bns; | |
139 } | |
140 | |
141 bntseq_t *bns_restore(const char *prefix) | |
142 { | |
143 char ann_filename[1024], amb_filename[1024], pac_filename[1024]; | |
144 strcat(strcpy(ann_filename, prefix), ".ann"); | |
145 strcat(strcpy(amb_filename, prefix), ".amb"); | |
146 strcat(strcpy(pac_filename, prefix), ".pac"); | |
147 return bns_restore_core(ann_filename, amb_filename, pac_filename); | |
148 } | |
149 | |
150 void bns_destroy(bntseq_t *bns) | |
151 { | |
152 if (bns == 0) return; | |
153 else { | |
154 int i; | |
155 if (bns->fp_pac) fclose(bns->fp_pac); | |
156 free(bns->ambs); | |
157 for (i = 0; i < bns->n_seqs; ++i) { | |
158 free(bns->anns[i].name); | |
159 free(bns->anns[i].anno); | |
160 } | |
161 free(bns->anns); | |
162 free(bns); | |
163 } | |
164 } | |
165 | |
166 void bns_fasta2bntseq(gzFile fp_fa, const char *prefix) | |
167 { | |
168 kseq_t *seq; | |
169 char name[1024]; | |
170 bntseq_t *bns; | |
171 bntamb1_t *q; | |
172 int l_buf; | |
173 unsigned char buf[0x10000]; | |
174 int32_t m_seqs, m_holes, l, i; | |
175 FILE *fp; | |
176 | |
177 // initialization | |
178 seq = kseq_init(fp_fa); | |
179 bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); | |
180 bns->seed = 11; // fixed seed for random generator | |
181 srand48(bns->seed); | |
182 m_seqs = m_holes = 8; | |
183 bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); | |
184 bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); | |
185 q = bns->ambs; | |
186 l_buf = 0; | |
187 strcpy(name, prefix); strcat(name, ".pac"); | |
188 fp = xopen(name, "wb"); | |
189 memset(buf, 0, 0x10000); | |
190 // read sequences | |
191 while ((l = kseq_read(seq)) >= 0) { | |
192 bntann1_t *p; | |
193 int lasts; | |
194 if (bns->n_seqs == m_seqs) { | |
195 m_seqs <<= 1; | |
196 bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t)); | |
197 } | |
198 p = bns->anns + bns->n_seqs; | |
199 p->name = strdup((char*)seq->name.s); | |
200 p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); | |
201 p->gi = 0; p->len = l; | |
202 p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; | |
203 p->n_ambs = 0; | |
204 for (i = 0, lasts = 0; i < l; ++i) { | |
205 int c = nst_nt4_table[(int)seq->seq.s[i]]; | |
206 if (c >= 4) { // N | |
207 if (lasts == seq->seq.s[i]) { // contiguous N | |
208 ++q->len; | |
209 } else { | |
210 if (bns->n_holes == m_holes) { | |
211 m_holes <<= 1; | |
212 bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t)); | |
213 } | |
214 q = bns->ambs + bns->n_holes; | |
215 q->len = 1; | |
216 q->offset = p->offset + i; | |
217 q->amb = seq->seq.s[i]; | |
218 ++p->n_ambs; | |
219 ++bns->n_holes; | |
220 } | |
221 } | |
222 lasts = seq->seq.s[i]; | |
223 { // fill buffer | |
224 if (c >= 4) c = lrand48()&0x3; | |
225 if (l_buf == 0x40000) { | |
226 fwrite(buf, 1, 0x10000, fp); | |
227 memset(buf, 0, 0x10000); | |
228 l_buf = 0; | |
229 } | |
230 buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1); | |
231 ++l_buf; | |
232 } | |
233 } | |
234 ++bns->n_seqs; | |
235 bns->l_pac += seq->seq.l; | |
236 } | |
237 xassert(bns->l_pac, "zero length sequence."); | |
238 { // finalize .pac file | |
239 ubyte_t ct; | |
240 fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp); | |
241 // the following codes make the pac file size always (l_pac/4+1+1) | |
242 if (bns->l_pac % 4 == 0) { | |
243 ct = 0; | |
244 fwrite(&ct, 1, 1, fp); | |
245 } | |
246 ct = bns->l_pac % 4; | |
247 fwrite(&ct, 1, 1, fp); | |
248 // close .pac file | |
249 fclose(fp); | |
250 } | |
251 bns_dump(bns, prefix); | |
252 bns_destroy(bns); | |
253 kseq_destroy(seq); | |
254 } | |
255 | |
256 int bwa_fa2pac(int argc, char *argv[]) | |
257 { | |
258 gzFile fp; | |
259 if (argc < 2) { | |
260 fprintf(stderr, "Usage: bwa fa2pac <in.fasta> [<out.prefix>]\n"); | |
261 return 1; | |
262 } | |
263 fp = xzopen(argv[1], "r"); | |
264 bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]); | |
265 gzclose(fp); | |
266 return 0; | |
267 } | |
268 | |
269 int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq) | |
270 { | |
271 int left, mid, right, nn; | |
272 if (pac_coor >= bns->l_pac) | |
273 err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac); | |
274 // binary search for the sequence ID. Note that this is a bit different from the following one... | |
275 left = 0; mid = 0; right = bns->n_seqs; | |
276 while (left < right) { | |
277 mid = (left + right) >> 1; | |
278 if (pac_coor >= bns->anns[mid].offset) { | |
279 if (mid == bns->n_seqs - 1) break; | |
280 if (pac_coor < bns->anns[mid+1].offset) break; | |
281 left = mid + 1; | |
282 } else right = mid; | |
283 } | |
284 *real_seq = mid; | |
285 // binary search for holes | |
286 left = 0; right = bns->n_holes; nn = 0; | |
287 while (left < right) { | |
288 int64_t mid = (left + right) >> 1; | |
289 if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; | |
290 else if (pac_coor + len <= bns->ambs[mid].offset) right = mid; | |
291 else { // overlap | |
292 if (pac_coor >= bns->ambs[mid].offset) { | |
293 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? | |
294 bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len; | |
295 } else { | |
296 nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? | |
297 bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor); | |
298 } | |
299 break; | |
300 } | |
301 } | |
302 return nn; | |
303 } |