diff bwa-0.6.2/bwtindex.c @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bwa-0.6.2/bwtindex.c	Fri Jul 18 07:55:59 2014 -0400
@@ -0,0 +1,158 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <time.h>
+#include <zlib.h>
+#include "bntseq.h"
+#include "bwt.h"
+#include "main.h"
+#include "utils.h"
+
+bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is);
+void bwa_pac_rev_core(const char *fn, const char *fn_rev);
+
+int bwa_index(int argc, char *argv[])
+{
+	char *prefix = 0, *str, *str2, *str3;
+	int c, algo_type = 0, is_color = 0, is_64 = 0;
+	clock_t t;
+	int64_t l_pac;
+
+	while ((c = getopt(argc, argv, "6ca:p:")) >= 0) {
+		switch (c) {
+		case 'a': // if -a is not set, algo_type will be determined later
+			if (strcmp(optarg, "div") == 0) algo_type = 1;
+			else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2;
+			else if (strcmp(optarg, "is") == 0) algo_type = 3;
+			else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
+			break;
+		case 'p': prefix = strdup(optarg); break;
+		case 'c': is_color = 1; break;
+		case '6': is_64 = 1; break;
+		default: return 1;
+		}
+	}
+
+	if (optind + 1 > argc) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   bwa index [-a bwtsw|is] [-c] <in.fasta>\n\n");
+		fprintf(stderr, "Options: -a STR    BWT construction algorithm: bwtsw or is [auto]\n");
+		fprintf(stderr, "         -p STR    prefix of the index [same as fasta name]\n");
+		fprintf(stderr, "         -6        index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
+//		fprintf(stderr, "         -c        build color-space index\n");
+		fprintf(stderr, "\n");
+		fprintf(stderr,	"Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
+		fprintf(stderr, "         `-a div' do not work not for long genomes. Please choose `-a'\n");
+		fprintf(stderr, "         according to the length of the genome.\n\n");
+		return 1;
+	}
+	if (prefix == 0) {
+		prefix = malloc(strlen(argv[optind]) + 4);
+		strcpy(prefix, argv[optind]);
+		if (is_64) strcat(prefix, ".64");
+	}
+	str  = (char*)calloc(strlen(prefix) + 10, 1);
+	str2 = (char*)calloc(strlen(prefix) + 10, 1);
+	str3 = (char*)calloc(strlen(prefix) + 10, 1);
+
+	if (is_color == 0) { // nucleotide indexing
+		gzFile fp = xzopen(argv[optind], "r");
+		t = clock();
+		fprintf(stderr, "[bwa_index] Pack FASTA... ");
+		l_pac = bns_fasta2bntseq(fp, prefix, 0);
+		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+		gzclose(fp);
+	} else { // color indexing
+		gzFile fp = xzopen(argv[optind], "r");
+		strcat(strcpy(str, prefix), ".nt");
+		t = clock();
+		fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
+		l_pac = bns_fasta2bntseq(fp, str, 0);
+		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+		gzclose(fp);
+		{
+			char *tmp_argv[3];
+			tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
+			t = clock();
+			fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
+			bwa_pac2cspac(3, tmp_argv);
+			fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+		}
+	}
+	if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
+	{
+		strcpy(str, prefix); strcat(str, ".pac");
+		strcpy(str2, prefix); strcat(str2, ".bwt");
+		t = clock();
+		fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
+		if (algo_type == 2) bwt_bwtgen(str, str2);
+		else if (algo_type == 1 || algo_type == 3) {
+			bwt_t *bwt;
+			bwt = bwt_pac2bwt(str, algo_type == 3);
+			bwt_dump_bwt(str2, bwt);
+			bwt_destroy(bwt);
+		}
+		fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	}
+	{
+		bwt_t *bwt;
+		strcpy(str, prefix); strcat(str, ".bwt");
+		t = clock();
+		fprintf(stderr, "[bwa_index] Update BWT... ");
+		bwt = bwt_restore_bwt(str);
+		bwt_bwtupdate_core(bwt);
+		bwt_dump_bwt(str, bwt);
+		bwt_destroy(bwt);
+		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	}
+	{
+		gzFile fp = xzopen(argv[optind], "r");
+		t = clock();
+		fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
+		l_pac = bns_fasta2bntseq(fp, prefix, 1);
+		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+		gzclose(fp);
+	}
+	{
+		bwt_t *bwt;
+		strcpy(str, prefix); strcat(str, ".bwt");
+		strcpy(str3, prefix); strcat(str3, ".sa");
+		t = clock();
+		fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
+		bwt = bwt_restore_bwt(str);
+		bwt_cal_sa(bwt, 32);
+		bwt_dump_sa(str3, bwt);
+		bwt_destroy(bwt);
+		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	}
+	free(str3); free(str2); free(str); free(prefix);
+	return 0;
+}