comparison bwa-0.6.2/bwtmisc.c @ 0:dd1186b11b3b draft

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dd1186b11b3b
1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <unistd.h>
32 #include "bntseq.h"
33 #include "utils.h"
34 #include "main.h"
35 #include "bwt.h"
36
37 #ifdef _DIVBWT
38 #include "divsufsort.h"
39 #endif
40
41 int is_bwt(ubyte_t *T, int n);
42
43 int64_t bwa_seq_len(const char *fn_pac)
44 {
45 FILE *fp;
46 int64_t pac_len;
47 ubyte_t c;
48 fp = xopen(fn_pac, "rb");
49 fseek(fp, -1, SEEK_END);
50 pac_len = ftell(fp);
51 fread(&c, 1, 1, fp);
52 fclose(fp);
53 return (pac_len - 1) * 4 + (int)c;
54 }
55
56 bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
57 {
58 bwt_t *bwt;
59 ubyte_t *buf, *buf2;
60 int i, pac_size;
61 FILE *fp;
62
63 // initialization
64 bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
65 bwt->seq_len = bwa_seq_len(fn_pac);
66 bwt->bwt_size = (bwt->seq_len + 15) >> 4;
67 fp = xopen(fn_pac, "rb");
68
69 // prepare sequence
70 pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
71 buf2 = (ubyte_t*)calloc(pac_size, 1);
72 fread(buf2, 1, pac_size, fp);
73 fclose(fp);
74 memset(bwt->L2, 0, 5 * 4);
75 buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
76 for (i = 0; i < bwt->seq_len; ++i) {
77 buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
78 ++bwt->L2[1+buf[i]];
79 }
80 for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
81 free(buf2);
82
83 // Burrows-Wheeler Transform
84 if (use_is) {
85 bwt->primary = is_bwt(buf, bwt->seq_len);
86 } else {
87 #ifdef _DIVBWT
88 bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
89 #else
90 err_fatal_simple("libdivsufsort is not compiled in.");
91 #endif
92 }
93 bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
94 for (i = 0; i < bwt->seq_len; ++i)
95 bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
96 free(buf);
97 return bwt;
98 }
99
100 int bwa_pac2bwt(int argc, char *argv[])
101 {
102 bwt_t *bwt;
103 int c, use_is = 1;
104 while ((c = getopt(argc, argv, "d")) >= 0) {
105 switch (c) {
106 case 'd': use_is = 0; break;
107 default: return 1;
108 }
109 }
110 if (optind + 2 > argc) {
111 fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
112 return 1;
113 }
114 bwt = bwt_pac2bwt(argv[optind], use_is);
115 bwt_dump_bwt(argv[optind+1], bwt);
116 bwt_destroy(bwt);
117 return 0;
118 }
119
120 #define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
121
122 void bwt_bwtupdate_core(bwt_t *bwt)
123 {
124 bwtint_t i, k, c[4], n_occ;
125 uint32_t *buf;
126
127 n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
128 bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
129 buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
130 c[0] = c[1] = c[2] = c[3] = 0;
131 for (i = k = 0; i < bwt->seq_len; ++i) {
132 if (i % OCC_INTERVAL == 0) {
133 memcpy(buf + k, c, sizeof(bwtint_t) * 4);
134 k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4)
135 }
136 if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2
137 ++c[bwt_B00(bwt, i)];
138 }
139 // the last element
140 memcpy(buf + k, c, sizeof(bwtint_t) * 4);
141 xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
142 // update bwt
143 free(bwt->bwt); bwt->bwt = buf;
144 }
145
146 int bwa_bwtupdate(int argc, char *argv[])
147 {
148 bwt_t *bwt;
149 if (argc < 2) {
150 fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
151 return 1;
152 }
153 bwt = bwt_restore_bwt(argv[1]);
154 bwt_bwtupdate_core(bwt);
155 bwt_dump_bwt(argv[1], bwt);
156 bwt_destroy(bwt);
157 return 0;
158 }
159
160 const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
161
162 /* this function is not memory efficient, but this will make life easier
163 Ideally we should also change .amb files as one 'N' in the nucleotide
164 sequence leads to two ambiguous colors. I may do this later... */
165 uint8_t *bwa_pac2cspac_core(const bntseq_t *bns)
166 {
167 uint8_t *pac, *cspac;
168 bwtint_t i;
169 int c1, c2;
170 pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
171 cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
172 fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
173 rewind(bns->fp_pac);
174 c1 = pac[0]>>6; cspac[0] = c1<<6;
175 for (i = 1; i < bns->l_pac; ++i) {
176 c2 = pac[i>>2] >> (~i&3)*2 & 3;
177 cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2;
178 c1 = c2;
179 }
180 free(pac);
181 return cspac;
182 }
183
184 int bwa_pac2cspac(int argc, char *argv[])
185 {
186 bntseq_t *bns;
187 uint8_t *cspac, ct;
188 char *str;
189 FILE *fp;
190
191 if (argc < 3) {
192 fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n");
193 return 1;
194 }
195 bns = bns_restore(argv[1]);
196 cspac = bwa_pac2cspac_core(bns);
197 bns_dump(bns, argv[2]);
198 // now write cspac
199 str = (char*)calloc(strlen(argv[2]) + 5, 1);
200 strcat(strcpy(str, argv[2]), ".pac");
201 fp = xopen(str, "wb");
202 fwrite(cspac, 1, bns->l_pac/4 + 1, fp);
203 ct = bns->l_pac % 4;
204 fwrite(&ct, 1, 1, fp);
205 fclose(fp);
206 bns_destroy(bns);
207 free(cspac);
208 return 0;
209 }
210
211 int bwa_bwt2sa(int argc, char *argv[])
212 {
213 bwt_t *bwt;
214 int c, sa_intv = 32;
215 while ((c = getopt(argc, argv, "i:")) >= 0) {
216 switch (c) {
217 case 'i': sa_intv = atoi(optarg); break;
218 default: return 1;
219 }
220 }
221 if (optind + 2 > argc) {
222 fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
223 return 1;
224 }
225 bwt = bwt_restore_bwt(argv[optind]);
226 bwt_cal_sa(bwt, sa_intv);
227 bwt_dump_sa(argv[optind+1], bwt);
228 bwt_destroy(bwt);
229 return 0;
230 }