annotate bwa-0.6.2/bwt.h @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
1 /* The MIT License
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
2
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
3 Copyright (c) 2008 Genome Research Ltd (GRL).
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
4
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
5 Permission is hereby granted, free of charge, to any person obtaining
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
6 a copy of this software and associated documentation files (the
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
7 "Software"), to deal in the Software without restriction, including
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
8 without limitation the rights to use, copy, modify, merge, publish,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
9 distribute, sublicense, and/or sell copies of the Software, and to
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
10 permit persons to whom the Software is furnished to do so, subject to
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
11 the following conditions:
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
12
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
13 The above copyright notice and this permission notice shall be
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
14 included in all copies or substantial portions of the Software.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
15
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
23 SOFTWARE.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
24 */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
25
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
27
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
28 #ifndef BWA_BWT_H
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
29 #define BWA_BWT_H
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
30
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
31 #include <stdint.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
32
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
33 // requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
34 #define OCC_INTERVAL 0x80
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
35
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
36 #ifndef BWA_UBYTE
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
37 #define BWA_UBYTE
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
38 typedef unsigned char ubyte_t;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
39 #endif
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
40
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
41 typedef uint64_t bwtint_t;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
42
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
43 typedef struct {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
44 bwtint_t primary; // S^{-1}(0), or the primary index of BWT
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
45 bwtint_t L2[5]; // C(), cumulative count
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
46 bwtint_t seq_len; // sequence length
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
47 bwtint_t bwt_size; // size of bwt, about seq_len/4
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
48 uint32_t *bwt; // BWT
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
49 // occurance array, separated to two parts
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
50 uint32_t cnt_table[256];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
51 // suffix array
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
52 int sa_intv;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
53 bwtint_t n_sa;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
54 bwtint_t *sa;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
55 } bwt_t;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
56
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
57 typedef struct {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
58 bwtint_t x[3], info;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
59 } bwtintv_t;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
60
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
61 typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
62
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
63 /* For general OCC_INTERVAL, the following is correct:
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
64 #define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16])
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
65 #define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
66 */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
67
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
68 // The following two lines are ONLY correct when OCC_INTERVAL==0x80
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
69 #define bwt_bwt(b, k) ((b)->bwt[((k)>>7<<4) + sizeof(bwtint_t) + (((k)&0x7f)>>4)])
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
70 #define bwt_occ_intv(b, k) ((b)->bwt + ((k)>>7<<4))
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
71
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
72 /* retrieve a character from the $-removed BWT string. Note that
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
73 * bwt_t::bwt is not exactly the BWT string and therefore this macro is
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
74 * called bwt_B0 instead of bwt_B */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
75 #define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
76
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
77 // inverse Psi function
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
78 #define bwt_invPsi(bwt, k) \
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
79 (((k) == (bwt)->primary)? 0 : \
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
80 ((k) < (bwt)->primary)? \
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
81 (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
82 : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1)))
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
83
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
84 #define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
85
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
86 #ifdef __cplusplus
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
87 extern "C" {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
88 #endif
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
89
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
90 void bwt_dump_bwt(const char *fn, const bwt_t *bwt);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
91 void bwt_dump_sa(const char *fn, const bwt_t *bwt);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
92
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
93 bwt_t *bwt_restore_bwt(const char *fn);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
94 void bwt_restore_sa(const char *fn, bwt_t *bwt);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
95
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
96 void bwt_destroy(bwt_t *bwt);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
97
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
98 void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
99 void bwt_cal_sa(bwt_t *bwt, int intv);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
100
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
101 void bwt_bwtupdate_core(bwt_t *bwt);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
102
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
103 inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
104 inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
105 bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
106
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
107 // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
108 void bwt_gen_cnt_table(bwt_t *bwt);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
109 inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
110 inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
111
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
112 int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
113 int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
114
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
115 /**
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
116 * Extend bi-SA-interval _ik_
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
117 */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
118 void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
119
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
120 /**
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
121 * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
122 * Return the end of the longest exact match starting from _x_.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
123 */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
124 int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
125
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
126 #ifdef __cplusplus
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
127 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
128 #endif
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
129
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
130 #endif