Mercurial > repos > ashvark > qiime_1_8_0
comparison bwa-0.6.2/bwtmisc.c @ 0:dd1186b11b3b draft
Uploaded BWA
author | ashvark |
---|---|
date | Fri, 18 Jul 2014 07:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dd1186b11b3b |
---|---|
1 /* The MIT License | |
2 | |
3 Copyright (c) 2008 Genome Research Ltd (GRL). | |
4 | |
5 Permission is hereby granted, free of charge, to any person obtaining | |
6 a copy of this software and associated documentation files (the | |
7 "Software"), to deal in the Software without restriction, including | |
8 without limitation the rights to use, copy, modify, merge, publish, | |
9 distribute, sublicense, and/or sell copies of the Software, and to | |
10 permit persons to whom the Software is furnished to do so, subject to | |
11 the following conditions: | |
12 | |
13 The above copyright notice and this permission notice shall be | |
14 included in all copies or substantial portions of the Software. | |
15 | |
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
23 SOFTWARE. | |
24 */ | |
25 | |
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */ | |
27 | |
28 #include <stdlib.h> | |
29 #include <stdio.h> | |
30 #include <string.h> | |
31 #include <unistd.h> | |
32 #include "bntseq.h" | |
33 #include "utils.h" | |
34 #include "main.h" | |
35 #include "bwt.h" | |
36 | |
37 #ifdef _DIVBWT | |
38 #include "divsufsort.h" | |
39 #endif | |
40 | |
41 int is_bwt(ubyte_t *T, int n); | |
42 | |
43 int64_t bwa_seq_len(const char *fn_pac) | |
44 { | |
45 FILE *fp; | |
46 int64_t pac_len; | |
47 ubyte_t c; | |
48 fp = xopen(fn_pac, "rb"); | |
49 fseek(fp, -1, SEEK_END); | |
50 pac_len = ftell(fp); | |
51 fread(&c, 1, 1, fp); | |
52 fclose(fp); | |
53 return (pac_len - 1) * 4 + (int)c; | |
54 } | |
55 | |
56 bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) | |
57 { | |
58 bwt_t *bwt; | |
59 ubyte_t *buf, *buf2; | |
60 int i, pac_size; | |
61 FILE *fp; | |
62 | |
63 // initialization | |
64 bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); | |
65 bwt->seq_len = bwa_seq_len(fn_pac); | |
66 bwt->bwt_size = (bwt->seq_len + 15) >> 4; | |
67 fp = xopen(fn_pac, "rb"); | |
68 | |
69 // prepare sequence | |
70 pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); | |
71 buf2 = (ubyte_t*)calloc(pac_size, 1); | |
72 fread(buf2, 1, pac_size, fp); | |
73 fclose(fp); | |
74 memset(bwt->L2, 0, 5 * 4); | |
75 buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); | |
76 for (i = 0; i < bwt->seq_len; ++i) { | |
77 buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; | |
78 ++bwt->L2[1+buf[i]]; | |
79 } | |
80 for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; | |
81 free(buf2); | |
82 | |
83 // Burrows-Wheeler Transform | |
84 if (use_is) { | |
85 bwt->primary = is_bwt(buf, bwt->seq_len); | |
86 } else { | |
87 #ifdef _DIVBWT | |
88 bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); | |
89 #else | |
90 err_fatal_simple("libdivsufsort is not compiled in."); | |
91 #endif | |
92 } | |
93 bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); | |
94 for (i = 0; i < bwt->seq_len; ++i) | |
95 bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); | |
96 free(buf); | |
97 return bwt; | |
98 } | |
99 | |
100 int bwa_pac2bwt(int argc, char *argv[]) | |
101 { | |
102 bwt_t *bwt; | |
103 int c, use_is = 1; | |
104 while ((c = getopt(argc, argv, "d")) >= 0) { | |
105 switch (c) { | |
106 case 'd': use_is = 0; break; | |
107 default: return 1; | |
108 } | |
109 } | |
110 if (optind + 2 > argc) { | |
111 fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n"); | |
112 return 1; | |
113 } | |
114 bwt = bwt_pac2bwt(argv[optind], use_is); | |
115 bwt_dump_bwt(argv[optind+1], bwt); | |
116 bwt_destroy(bwt); | |
117 return 0; | |
118 } | |
119 | |
120 #define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) | |
121 | |
122 void bwt_bwtupdate_core(bwt_t *bwt) | |
123 { | |
124 bwtint_t i, k, c[4], n_occ; | |
125 uint32_t *buf; | |
126 | |
127 n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; | |
128 bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size | |
129 buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt | |
130 c[0] = c[1] = c[2] = c[3] = 0; | |
131 for (i = k = 0; i < bwt->seq_len; ++i) { | |
132 if (i % OCC_INTERVAL == 0) { | |
133 memcpy(buf + k, c, sizeof(bwtint_t) * 4); | |
134 k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) | |
135 } | |
136 if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 | |
137 ++c[bwt_B00(bwt, i)]; | |
138 } | |
139 // the last element | |
140 memcpy(buf + k, c, sizeof(bwtint_t) * 4); | |
141 xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); | |
142 // update bwt | |
143 free(bwt->bwt); bwt->bwt = buf; | |
144 } | |
145 | |
146 int bwa_bwtupdate(int argc, char *argv[]) | |
147 { | |
148 bwt_t *bwt; | |
149 if (argc < 2) { | |
150 fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n"); | |
151 return 1; | |
152 } | |
153 bwt = bwt_restore_bwt(argv[1]); | |
154 bwt_bwtupdate_core(bwt); | |
155 bwt_dump_bwt(argv[1], bwt); | |
156 bwt_destroy(bwt); | |
157 return 0; | |
158 } | |
159 | |
160 const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; | |
161 | |
162 /* this function is not memory efficient, but this will make life easier | |
163 Ideally we should also change .amb files as one 'N' in the nucleotide | |
164 sequence leads to two ambiguous colors. I may do this later... */ | |
165 uint8_t *bwa_pac2cspac_core(const bntseq_t *bns) | |
166 { | |
167 uint8_t *pac, *cspac; | |
168 bwtint_t i; | |
169 int c1, c2; | |
170 pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); | |
171 cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); | |
172 fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); | |
173 rewind(bns->fp_pac); | |
174 c1 = pac[0]>>6; cspac[0] = c1<<6; | |
175 for (i = 1; i < bns->l_pac; ++i) { | |
176 c2 = pac[i>>2] >> (~i&3)*2 & 3; | |
177 cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2; | |
178 c1 = c2; | |
179 } | |
180 free(pac); | |
181 return cspac; | |
182 } | |
183 | |
184 int bwa_pac2cspac(int argc, char *argv[]) | |
185 { | |
186 bntseq_t *bns; | |
187 uint8_t *cspac, ct; | |
188 char *str; | |
189 FILE *fp; | |
190 | |
191 if (argc < 3) { | |
192 fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n"); | |
193 return 1; | |
194 } | |
195 bns = bns_restore(argv[1]); | |
196 cspac = bwa_pac2cspac_core(bns); | |
197 bns_dump(bns, argv[2]); | |
198 // now write cspac | |
199 str = (char*)calloc(strlen(argv[2]) + 5, 1); | |
200 strcat(strcpy(str, argv[2]), ".pac"); | |
201 fp = xopen(str, "wb"); | |
202 fwrite(cspac, 1, bns->l_pac/4 + 1, fp); | |
203 ct = bns->l_pac % 4; | |
204 fwrite(&ct, 1, 1, fp); | |
205 fclose(fp); | |
206 bns_destroy(bns); | |
207 free(cspac); | |
208 return 0; | |
209 } | |
210 | |
211 int bwa_bwt2sa(int argc, char *argv[]) | |
212 { | |
213 bwt_t *bwt; | |
214 int c, sa_intv = 32; | |
215 while ((c = getopt(argc, argv, "i:")) >= 0) { | |
216 switch (c) { | |
217 case 'i': sa_intv = atoi(optarg); break; | |
218 default: return 1; | |
219 } | |
220 } | |
221 if (optind + 2 > argc) { | |
222 fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv); | |
223 return 1; | |
224 } | |
225 bwt = bwt_restore_bwt(argv[optind]); | |
226 bwt_cal_sa(bwt, sa_intv); | |
227 bwt_dump_sa(argv[optind+1], bwt); | |
228 bwt_destroy(bwt); | |
229 return 0; | |
230 } |