Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/tools/bwa-0.5.7-mh/is.c @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/tools/bwa-0.5.7-mh/is.c Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,218 @@ +/* + * sais.c for sais-lite + * Copyright (c) 2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <stdlib.h> + +typedef unsigned char ubyte_t; +#define chr(i) (cs == sizeof(int) ? ((const int *)T)[i]:((const unsigned char *)T)[i]) + +/* find the start or end of each bucket */ +static void getCounts(const unsigned char *T, int *C, int n, int k, int cs) +{ + int i; + for (i = 0; i < k; ++i) C[i] = 0; + for (i = 0; i < n; ++i) ++C[chr(i)]; +} +static void getBuckets(const int *C, int *B, int k, int end) +{ + int i, sum = 0; + if (end) { + for (i = 0; i < k; ++i) { + sum += C[i]; + B[i] = sum; + } + } else { + for (i = 0; i < k; ++i) { + sum += C[i]; + B[i] = sum - C[i]; + } + } +} + +/* compute SA */ +static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs) +{ + int *b, i, j; + int c0, c1; + /* compute SAl */ + if (C == B) getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 0); /* find starts of buckets */ + j = n - 1; + b = SA + B[c1 = chr(j)]; + *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j; + for (i = 0; i < n; ++i) { + j = SA[i], SA[i] = ~j; + if (0 < j) { + --j; + if ((c0 = chr(j)) != c1) { + B[c1] = b - SA; + b = SA + B[c1 = c0]; + } + *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j; + } + } + /* compute SAs */ + if (C == B) getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 1); /* find ends of buckets */ + for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) { + if (0 < (j = SA[i])) { + --j; + if ((c0 = chr(j)) != c1) { + B[c1] = b - SA; + b = SA + B[c1 = c0]; + } + *--b = ((j == 0) || (chr(j - 1) > c1)) ? ~j : j; + } else SA[i] = ~j; + } +} + +/* + * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working + * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet + */ +static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs) +{ + int *C, *B, *RA; + int i, j, c, m, p, q, plen, qlen, name; + int c0, c1; + int diff; + + /* stage 1: reduce the problem by at least 1/2 sort all the + * S-substrings */ + if (k <= fs) { + C = SA + n; + B = (k <= (fs - k)) ? C + k : C; + } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2; + getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 1); /* find ends of buckets */ + for (i = 0; i < n; ++i) SA[i] = 0; + for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = chr(i)) < (c1 + c)) c = 1; + else if (c != 0) SA[--B[c1]] = i + 1, c = 0; + } + induceSA(T, SA, C, B, n, k, cs); + if (fs < k) free(C); + /* compact all the sorted substrings into the first m items of SA + * 2*m must be not larger than n (proveable) */ + for (i = 0, m = 0; i < n; ++i) { + p = SA[i]; + if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) { + for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j); + if ((j < n) && (c0 < c1)) SA[m++] = p; + } + } + for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */ + /* store the length of all substrings */ + for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = chr(i)) < (c1 + c)) c = 1; + else if (c != 0) { + SA[m + ((i + 1) >> 1)] = j - i - 1; + j = i + 1; + c = 0; + } + } + /* find the lexicographic names of all substrings */ + for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) { + p = SA[i], plen = SA[m + (p >> 1)], diff = 1; + if (plen == qlen) { + for (j = 0; (j < plen) && (chr(p + j) == chr(q + j)); j++); + if (j == plen) diff = 0; + } + if (diff != 0) ++name, q = p, qlen = plen; + SA[m + (p >> 1)] = name; + } + + /* stage 2: solve the reduced problem recurse if names are not yet + * unique */ + if (name < m) { + RA = SA + n + fs - m; + for (i = n - 1, j = m - 1; m <= i; --i) { + if (SA[i] != 0) RA[j--] = SA[i] - 1; + } + if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2; + for (i = n - 2, j = m - 1, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = chr(i)) < (c1 + c)) c = 1; + else if (c != 0) RA[j--] = i + 1, c = 0; /* get p1 */ + } + for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */ + } + /* stage 3: induce the result for the original problem */ + if (k <= fs) { + C = SA + n; + B = (k <= (fs - k)) ? C + k : C; + } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2; + /* put all left-most S characters into their buckets */ + getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 1); /* find ends of buckets */ + for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */ + for (i = m - 1; 0 <= i; --i) { + j = SA[i], SA[i] = 0; + SA[--B[chr(j)]] = j; + } + induceSA(T, SA, C, B, n, k, cs); + if (fs < k) free(C); + return 0; +} + +/** + * Constructs the suffix array of a given string. + * @param T[0..n-1] The input string. + * @param SA[0..n] The output array of suffixes. + * @param n The length of the given string. + * @return 0 if no error occurred + */ +int is_sa(const ubyte_t *T, int *SA, int n) +{ + if ((T == NULL) || (SA == NULL) || (n < 0)) return -1; + SA[0] = n; + if (n <= 1) { + if (n == 1) SA[1] = 0; + return 0; + } + return sais_main(T, SA+1, 0, n, 256, 1); +} + +/** + * Constructs the burrows-wheeler transformed string of a given string. + * @param T[0..n-1] The input string. + * @param n The length of the given string. + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ +int is_bwt(ubyte_t *T, int n) +{ + int *SA, i, primary = 0; + SA = (int*)calloc(n+1, sizeof(int)); + is_sa(T, SA, n); + + for (i = 0; i <= n; ++i) { + if (SA[i] == 0) primary = i; + else SA[i] = T[SA[i] - 1]; + } + for (i = 0; i < primary; ++i) T[i] = SA[i]; + for (; i < n; ++i) T[i] = SA[i + 1]; + free(SA); + return primary; +}