Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bwtmisc.c @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 /* The MIT License | |
2 | |
3 Copyright (c) 2008 Genome Research Ltd (GRL). | |
4 | |
5 Permission is hereby granted, free of charge, to any person obtaining | |
6 a copy of this software and associated documentation files (the | |
7 "Software"), to deal in the Software without restriction, including | |
8 without limitation the rights to use, copy, modify, merge, publish, | |
9 distribute, sublicense, and/or sell copies of the Software, and to | |
10 permit persons to whom the Software is furnished to do so, subject to | |
11 the following conditions: | |
12 | |
13 The above copyright notice and this permission notice shall be | |
14 included in all copies or substantial portions of the Software. | |
15 | |
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
23 SOFTWARE. | |
24 */ | |
25 | |
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */ | |
27 | |
28 #include <stdlib.h> | |
29 #include <stdio.h> | |
30 #include <string.h> | |
31 #include <unistd.h> | |
32 #include "bntseq.h" | |
33 #include "utils.h" | |
34 #include "main.h" | |
35 #include "bwt.h" | |
36 | |
37 #ifdef _DIVBWT | |
38 #include "divsufsort.h" | |
39 #endif | |
40 | |
41 int is_bwt(ubyte_t *T, int n); | |
42 | |
43 int64_t bwa_seq_len(const char *fn_pac) | |
44 { | |
45 FILE *fp; | |
46 int64_t pac_len; | |
47 ubyte_t c; | |
48 fp = xopen(fn_pac, "rb"); | |
49 fseek(fp, -1, SEEK_END); | |
50 pac_len = ftell(fp); | |
51 fread(&c, 1, 1, fp); | |
52 fclose(fp); | |
53 return (pac_len - 1) * 4 + (int)c; | |
54 } | |
55 | |
56 bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) | |
57 { | |
58 bwt_t *bwt; | |
59 ubyte_t *buf, *buf2; | |
60 int i, pac_size; | |
61 FILE *fp; | |
62 | |
63 // initialization | |
64 bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); | |
65 bwt->seq_len = bwa_seq_len(fn_pac); | |
66 bwt->bwt_size = (bwt->seq_len + 15) >> 4; | |
67 fp = xopen(fn_pac, "rb"); | |
68 | |
69 // prepare sequence | |
70 pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); | |
71 buf2 = (ubyte_t*)calloc(pac_size, 1); | |
72 fread(buf2, 1, pac_size, fp); | |
73 fclose(fp); | |
74 memset(bwt->L2, 0, 5 * 4); | |
75 buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); | |
76 for (i = 0; i < bwt->seq_len; ++i) { | |
77 buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; | |
78 ++bwt->L2[1+buf[i]]; | |
79 } | |
80 for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; | |
81 free(buf2); | |
82 | |
83 // Burrows-Wheeler Transform | |
84 if (use_is) { | |
85 bwt->primary = is_bwt(buf, bwt->seq_len); | |
86 } else { | |
87 #ifdef _DIVBWT | |
88 bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); | |
89 #else | |
90 err_fatal_simple("libdivsufsort is not compiled in."); | |
91 #endif | |
92 } | |
93 bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); | |
94 for (i = 0; i < bwt->seq_len; ++i) | |
95 bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); | |
96 free(buf); | |
97 return bwt; | |
98 } | |
99 | |
100 int bwa_pac2bwt(int argc, char *argv[]) | |
101 { | |
102 bwt_t *bwt; | |
103 int c, use_is = 1; | |
104 while ((c = getopt(argc, argv, "d")) >= 0) { | |
105 switch (c) { | |
106 case 'd': use_is = 0; break; | |
107 default: return 1; | |
108 } | |
109 } | |
110 if (optind + 2 > argc) { | |
111 fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n"); | |
112 return 1; | |
113 } | |
114 bwt = bwt_pac2bwt(argv[optind], use_is); | |
115 bwt_dump_bwt(argv[optind+1], bwt); | |
116 bwt_destroy(bwt); | |
117 return 0; | |
118 } | |
119 | |
120 #define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) | |
121 | |
122 void bwt_bwtupdate_core(bwt_t *bwt) | |
123 { | |
124 bwtint_t i, k, c[4], n_occ; | |
125 uint32_t *buf; | |
126 | |
127 n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; | |
128 bwt->bwt_size += n_occ * 4; // the new size | |
129 buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt | |
130 c[0] = c[1] = c[2] = c[3] = 0; | |
131 for (i = k = 0; i < bwt->seq_len; ++i) { | |
132 if (i % OCC_INTERVAL == 0) { | |
133 memcpy(buf + k, c, sizeof(bwtint_t) * 4); | |
134 k += 4; | |
135 } | |
136 if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; | |
137 ++c[bwt_B00(bwt, i)]; | |
138 } | |
139 // the last element | |
140 memcpy(buf + k, c, sizeof(bwtint_t) * 4); | |
141 xassert(k + 4 == bwt->bwt_size, "inconsistent bwt_size"); | |
142 // update bwt | |
143 free(bwt->bwt); bwt->bwt = buf; | |
144 } | |
145 | |
146 int bwa_bwtupdate(int argc, char *argv[]) | |
147 { | |
148 bwt_t *bwt; | |
149 if (argc < 2) { | |
150 fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n"); | |
151 return 1; | |
152 } | |
153 bwt = bwt_restore_bwt(argv[1]); | |
154 bwt_bwtupdate_core(bwt); | |
155 bwt_dump_bwt(argv[1], bwt); | |
156 bwt_destroy(bwt); | |
157 return 0; | |
158 } | |
159 | |
160 void bwa_pac_rev_core(const char *fn, const char *fn_rev) | |
161 { | |
162 int64_t seq_len, i; | |
163 bwtint_t pac_len, j; | |
164 ubyte_t *bufin, *bufout, ct; | |
165 FILE *fp; | |
166 seq_len = bwa_seq_len(fn); | |
167 pac_len = (seq_len >> 2) + 1; | |
168 bufin = (ubyte_t*)calloc(pac_len, 1); | |
169 bufout = (ubyte_t*)calloc(pac_len, 1); | |
170 fp = xopen(fn, "rb"); | |
171 fread(bufin, 1, pac_len, fp); | |
172 fclose(fp); | |
173 for (i = seq_len - 1, j = 0; i >= 0; --i) { | |
174 int c = bufin[i>>2] >> ((~i&3)<<1) & 3; | |
175 bwtint_t j = seq_len - 1 - i; | |
176 bufout[j>>2] |= c << ((~j&3)<<1); | |
177 } | |
178 free(bufin); | |
179 fp = xopen(fn_rev, "wb"); | |
180 fwrite(bufout, 1, pac_len, fp); | |
181 ct = seq_len % 4; | |
182 fwrite(&ct, 1, 1, fp); | |
183 fclose(fp); | |
184 free(bufout); | |
185 } | |
186 | |
187 int bwa_pac_rev(int argc, char *argv[]) | |
188 { | |
189 if (argc < 3) { | |
190 fprintf(stderr, "Usage: bwa pac_rev <in.pac> <out.pac>\n"); | |
191 return 1; | |
192 } | |
193 bwa_pac_rev_core(argv[1], argv[2]); | |
194 return 0; | |
195 } | |
196 | |
197 const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; | |
198 | |
199 /* this function is not memory efficient, but this will make life easier | |
200 Ideally we should also change .amb files as one 'N' in the nucleotide | |
201 sequence leads to two ambiguous colors. I may do this later... */ | |
202 uint8_t *bwa_pac2cspac_core(const bntseq_t *bns) | |
203 { | |
204 uint8_t *pac, *cspac; | |
205 bwtint_t i; | |
206 int c1, c2; | |
207 pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); | |
208 cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); | |
209 fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); | |
210 rewind(bns->fp_pac); | |
211 c1 = pac[0]>>6; cspac[0] = c1<<6; | |
212 for (i = 1; i < bns->l_pac; ++i) { | |
213 c2 = pac[i>>2] >> (~i&3)*2 & 3; | |
214 cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2; | |
215 c1 = c2; | |
216 } | |
217 free(pac); | |
218 return cspac; | |
219 } | |
220 | |
221 int bwa_pac2cspac(int argc, char *argv[]) | |
222 { | |
223 bntseq_t *bns; | |
224 uint8_t *cspac, ct; | |
225 char *str; | |
226 FILE *fp; | |
227 | |
228 if (argc < 3) { | |
229 fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n"); | |
230 return 1; | |
231 } | |
232 bns = bns_restore(argv[1]); | |
233 cspac = bwa_pac2cspac_core(bns); | |
234 bns_dump(bns, argv[2]); | |
235 // now write cspac | |
236 str = (char*)calloc(strlen(argv[2]) + 5, 1); | |
237 strcat(strcpy(str, argv[2]), ".pac"); | |
238 fp = xopen(str, "wb"); | |
239 fwrite(cspac, 1, bns->l_pac/4 + 1, fp); | |
240 ct = bns->l_pac % 4; | |
241 fwrite(&ct, 1, 1, fp); | |
242 fclose(fp); | |
243 bns_destroy(bns); | |
244 free(cspac); | |
245 return 0; | |
246 } | |
247 | |
248 int bwa_bwt2sa(int argc, char *argv[]) | |
249 { | |
250 bwt_t *bwt; | |
251 int c, sa_intv = 32; | |
252 while ((c = getopt(argc, argv, "i:")) >= 0) { | |
253 switch (c) { | |
254 case 'i': sa_intv = atoi(optarg); break; | |
255 default: return 1; | |
256 } | |
257 } | |
258 if (optind + 2 > argc) { | |
259 fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv); | |
260 return 1; | |
261 } | |
262 bwt = bwt_restore_bwt(argv[optind]); | |
263 bwt_cal_sa(bwt, sa_intv); | |
264 bwt_dump_sa(argv[optind+1], bwt); | |
265 bwt_destroy(bwt); | |
266 return 0; | |
267 } |