comparison bwa-0.6.2/bwtindex.c @ 0:dd1186b11b3b draft

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dd1186b11b3b
1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <unistd.h>
32 #include <time.h>
33 #include <zlib.h>
34 #include "bntseq.h"
35 #include "bwt.h"
36 #include "main.h"
37 #include "utils.h"
38
39 bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is);
40 void bwa_pac_rev_core(const char *fn, const char *fn_rev);
41
42 int bwa_index(int argc, char *argv[])
43 {
44 char *prefix = 0, *str, *str2, *str3;
45 int c, algo_type = 0, is_color = 0, is_64 = 0;
46 clock_t t;
47 int64_t l_pac;
48
49 while ((c = getopt(argc, argv, "6ca:p:")) >= 0) {
50 switch (c) {
51 case 'a': // if -a is not set, algo_type will be determined later
52 if (strcmp(optarg, "div") == 0) algo_type = 1;
53 else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2;
54 else if (strcmp(optarg, "is") == 0) algo_type = 3;
55 else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
56 break;
57 case 'p': prefix = strdup(optarg); break;
58 case 'c': is_color = 1; break;
59 case '6': is_64 = 1; break;
60 default: return 1;
61 }
62 }
63
64 if (optind + 1 > argc) {
65 fprintf(stderr, "\n");
66 fprintf(stderr, "Usage: bwa index [-a bwtsw|is] [-c] <in.fasta>\n\n");
67 fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
68 fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
69 fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
70 // fprintf(stderr, " -c build color-space index\n");
71 fprintf(stderr, "\n");
72 fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
73 fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n");
74 fprintf(stderr, " according to the length of the genome.\n\n");
75 return 1;
76 }
77 if (prefix == 0) {
78 prefix = malloc(strlen(argv[optind]) + 4);
79 strcpy(prefix, argv[optind]);
80 if (is_64) strcat(prefix, ".64");
81 }
82 str = (char*)calloc(strlen(prefix) + 10, 1);
83 str2 = (char*)calloc(strlen(prefix) + 10, 1);
84 str3 = (char*)calloc(strlen(prefix) + 10, 1);
85
86 if (is_color == 0) { // nucleotide indexing
87 gzFile fp = xzopen(argv[optind], "r");
88 t = clock();
89 fprintf(stderr, "[bwa_index] Pack FASTA... ");
90 l_pac = bns_fasta2bntseq(fp, prefix, 0);
91 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
92 gzclose(fp);
93 } else { // color indexing
94 gzFile fp = xzopen(argv[optind], "r");
95 strcat(strcpy(str, prefix), ".nt");
96 t = clock();
97 fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
98 l_pac = bns_fasta2bntseq(fp, str, 0);
99 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
100 gzclose(fp);
101 {
102 char *tmp_argv[3];
103 tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
104 t = clock();
105 fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
106 bwa_pac2cspac(3, tmp_argv);
107 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
108 }
109 }
110 if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
111 {
112 strcpy(str, prefix); strcat(str, ".pac");
113 strcpy(str2, prefix); strcat(str2, ".bwt");
114 t = clock();
115 fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
116 if (algo_type == 2) bwt_bwtgen(str, str2);
117 else if (algo_type == 1 || algo_type == 3) {
118 bwt_t *bwt;
119 bwt = bwt_pac2bwt(str, algo_type == 3);
120 bwt_dump_bwt(str2, bwt);
121 bwt_destroy(bwt);
122 }
123 fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
124 }
125 {
126 bwt_t *bwt;
127 strcpy(str, prefix); strcat(str, ".bwt");
128 t = clock();
129 fprintf(stderr, "[bwa_index] Update BWT... ");
130 bwt = bwt_restore_bwt(str);
131 bwt_bwtupdate_core(bwt);
132 bwt_dump_bwt(str, bwt);
133 bwt_destroy(bwt);
134 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
135 }
136 {
137 gzFile fp = xzopen(argv[optind], "r");
138 t = clock();
139 fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
140 l_pac = bns_fasta2bntseq(fp, prefix, 1);
141 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
142 gzclose(fp);
143 }
144 {
145 bwt_t *bwt;
146 strcpy(str, prefix); strcat(str, ".bwt");
147 strcpy(str3, prefix); strcat(str3, ".sa");
148 t = clock();
149 fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
150 bwt = bwt_restore_bwt(str);
151 bwt_cal_sa(bwt, 32);
152 bwt_dump_sa(str3, bwt);
153 bwt_destroy(bwt);
154 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
155 }
156 free(str3); free(str2); free(str); free(prefix);
157 return 0;
158 }