comparison bwa-0.6.2/bwt.h @ 0:dd1186b11b3b draft

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dd1186b11b3b
1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 #ifndef BWA_BWT_H
29 #define BWA_BWT_H
30
31 #include <stdint.h>
32
33 // requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line
34 #define OCC_INTERVAL 0x80
35
36 #ifndef BWA_UBYTE
37 #define BWA_UBYTE
38 typedef unsigned char ubyte_t;
39 #endif
40
41 typedef uint64_t bwtint_t;
42
43 typedef struct {
44 bwtint_t primary; // S^{-1}(0), or the primary index of BWT
45 bwtint_t L2[5]; // C(), cumulative count
46 bwtint_t seq_len; // sequence length
47 bwtint_t bwt_size; // size of bwt, about seq_len/4
48 uint32_t *bwt; // BWT
49 // occurance array, separated to two parts
50 uint32_t cnt_table[256];
51 // suffix array
52 int sa_intv;
53 bwtint_t n_sa;
54 bwtint_t *sa;
55 } bwt_t;
56
57 typedef struct {
58 bwtint_t x[3], info;
59 } bwtintv_t;
60
61 typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
62
63 /* For general OCC_INTERVAL, the following is correct:
64 #define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16])
65 #define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4)
66 */
67
68 // The following two lines are ONLY correct when OCC_INTERVAL==0x80
69 #define bwt_bwt(b, k) ((b)->bwt[((k)>>7<<4) + sizeof(bwtint_t) + (((k)&0x7f)>>4)])
70 #define bwt_occ_intv(b, k) ((b)->bwt + ((k)>>7<<4))
71
72 /* retrieve a character from the $-removed BWT string. Note that
73 * bwt_t::bwt is not exactly the BWT string and therefore this macro is
74 * called bwt_B0 instead of bwt_B */
75 #define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3)
76
77 // inverse Psi function
78 #define bwt_invPsi(bwt, k) \
79 (((k) == (bwt)->primary)? 0 : \
80 ((k) < (bwt)->primary)? \
81 (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \
82 : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1)))
83
84 #define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0)
85
86 #ifdef __cplusplus
87 extern "C" {
88 #endif
89
90 void bwt_dump_bwt(const char *fn, const bwt_t *bwt);
91 void bwt_dump_sa(const char *fn, const bwt_t *bwt);
92
93 bwt_t *bwt_restore_bwt(const char *fn);
94 void bwt_restore_sa(const char *fn, bwt_t *bwt);
95
96 void bwt_destroy(bwt_t *bwt);
97
98 void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
99 void bwt_cal_sa(bwt_t *bwt, int intv);
100
101 void bwt_bwtupdate_core(bwt_t *bwt);
102
103 inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
104 inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
105 bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k);
106
107 // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
108 void bwt_gen_cnt_table(bwt_t *bwt);
109 inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
110 inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
111
112 int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
113 int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
114
115 /**
116 * Extend bi-SA-interval _ik_
117 */
118 void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back);
119
120 /**
121 * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_.
122 * Return the end of the longest exact match starting from _x_.
123 */
124 int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
125
126 #ifdef __cplusplus
127 }
128 #endif
129
130 #endif