diff bwa-0.6.2/bwtaln.c @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bwa-0.6.2/bwtaln.c	Fri Jul 18 07:55:59 2014 -0400
@@ -0,0 +1,356 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stdint.h>
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include "bwtaln.h"
+#include "bwtgap.h"
+#include "utils.h"
+
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#endif
+
+gap_opt_t *gap_init_opt()
+{
+	gap_opt_t *o;
+	o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t));
+	/* IMPORTANT: s_mm*10 should be about the average base error
+	   rate. Voilating this requirement will break pairing! */
+	o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4;
+	o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6;
+	o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000;
+	o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
+	o->seed_len = 32; o->max_seed_diff = 2;
+	o->fnr = 0.04;
+	o->n_threads = 1;
+	o->max_top2 = 30;
+	o->trim_qual = 0;
+	return o;
+}
+
+int bwa_cal_maxdiff(int l, double err, double thres)
+{
+	double elambda = exp(-l * err);
+	double sum, y = 1.0;
+	int k, x = 1;
+	for (k = 1, sum = elambda; k < 1000; ++k) {
+		y *= l * err;
+		x *= k;
+		sum += elambda * y / x;
+		if (1.0 - sum < thres) return k;
+	}
+	return 2;
+}
+
+// width must be filled as zero
+int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width)
+{
+	bwtint_t k, l, ok, ol;
+	int i, bid;
+	bid = 0;
+	k = 0; l = bwt->seq_len;
+	for (i = 0; i < len; ++i) {
+		ubyte_t c = str[i];
+		if (c < 4) {
+			bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
+			k = bwt->L2[c] + ok + 1;
+			l = bwt->L2[c] + ol;
+		}
+		if (k > l || c > 3) { // then restart
+			k = 0;
+			l = bwt->seq_len;
+			++bid;
+		}
+		width[i].w = l - k + 1;
+		width[i].bid = bid;
+	}
+	width[len].w = 0;
+	width[len].bid = ++bid;
+	return bid;
+}
+
+void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
+{
+	int i, j, max_l = 0, max_len;
+	gap_stack_t *stack;
+	bwt_width_t *w, *seed_w;
+	gap_opt_t local_opt = *opt;
+
+	// initiate priority stack
+	for (i = max_len = 0; i != n_seqs; ++i)
+		if (seqs[i].len > max_len) max_len = seqs[i].len;
+	if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr);
+	if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff;
+	stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);
+
+	seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
+	w = 0;
+	for (i = 0; i != n_seqs; ++i) {
+		bwa_seq_t *p = seqs + i;
+#ifdef HAVE_PTHREAD
+		if (i % opt->n_threads != tid) continue;
+#endif
+		p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0;
+		if (max_l < p->len) {
+			max_l = p->len;
+			w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t));
+			memset(w, 0, (max_l + 1) * sizeof(bwt_width_t));
+		}
+		bwt_cal_width(bwt, p->len, p->seq, w);
+		if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr);
+		local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff;
+		if (p->len > opt->seed_len)
+			bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w);
+		// core function
+		for (j = 0; j < p->len; ++j) // we need to complement
+			p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j];
+		p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
+		// clean up the unused data in the record
+		free(p->name); free(p->seq); free(p->rseq); free(p->qual);
+		p->name = 0; p->seq = p->rseq = p->qual = 0;
+	}
+	free(seed_w); free(w);
+	gap_destroy_stack(stack);
+}
+
+#ifdef HAVE_PTHREAD
+typedef struct {
+	int tid;
+	bwt_t *bwt;
+	int n_seqs;
+	bwa_seq_t *seqs;
+	const gap_opt_t *opt;
+} thread_aux_t;
+
+static void *worker(void *data)
+{
+	thread_aux_t *d = (thread_aux_t*)data;
+	bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt);
+	return 0;
+}
+#endif
+
+bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa)
+{
+	bwa_seqio_t *ks;
+	if (mode & BWA_MODE_BAM) { // open BAM
+		int which = 0;
+		if (mode & BWA_MODE_BAM_SE) which |= 4;
+		if (mode & BWA_MODE_BAM_READ1) which |= 1;
+		if (mode & BWA_MODE_BAM_READ2) which |= 2;
+		if (which == 0) which = 7; // then read all reads
+		ks = bwa_bam_open(fn_fa, which);
+	} else ks = bwa_seq_open(fn_fa);
+	return ks;
+}
+
+void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
+{
+	int i, n_seqs, tot_seqs = 0;
+	bwa_seq_t *seqs;
+	bwa_seqio_t *ks;
+	clock_t t;
+	bwt_t *bwt;
+
+	// initialization
+	ks = bwa_open_reads(opt->mode, fn_fa);
+
+	{ // load BWT
+		char *str = (char*)calloc(strlen(prefix) + 10, 1);
+		strcpy(str, prefix); strcat(str, ".bwt");  bwt = bwt_restore_bwt(str);
+		free(str);
+	}
+
+	// core loop
+	err_fwrite(opt, sizeof(gap_opt_t), 1, stdout);
+	while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) {
+		tot_seqs += n_seqs;
+		t = clock();
+
+		fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... ");
+
+#ifdef HAVE_PTHREAD
+		if (opt->n_threads <= 1) { // no multi-threading at all
+			bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
+		} else {
+			pthread_t *tid;
+			pthread_attr_t attr;
+			thread_aux_t *data;
+			int j;
+			pthread_attr_init(&attr);
+			pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+			data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
+			tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
+			for (j = 0; j < opt->n_threads; ++j) {
+				data[j].tid = j; data[j].bwt = bwt;
+				data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt;
+				pthread_create(&tid[j], &attr, worker, data + j);
+			}
+			for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
+			free(data); free(tid);
+		}
+#else
+		bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
+#endif
+
+		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
+
+		t = clock();
+		fprintf(stderr, "[bwa_aln_core] write to the disk... ");
+		for (i = 0; i < n_seqs; ++i) {
+			bwa_seq_t *p = seqs + i;
+			err_fwrite(&p->n_aln, 4, 1, stdout);
+			if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout);
+		}
+		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
+
+		bwa_free_read_seq(n_seqs, seqs);
+		fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
+	}
+
+	// destroy
+	bwt_destroy(bwt);
+	bwa_seq_close(ks);
+}
+
+char *bwa_infer_prefix(const char *hint)
+{
+	char *prefix;
+	int l_hint;
+	FILE *fp;
+	l_hint = strlen(hint);
+	prefix = malloc(l_hint + 3 + 4 + 1);
+	strcpy(prefix, hint);
+	strcpy(prefix + l_hint, ".64.bwt");
+	if ((fp = fopen(prefix, "rb")) != 0) {
+		fclose(fp);
+		prefix[l_hint + 3] = 0;
+		return prefix;
+	} else {
+		strcpy(prefix + l_hint, ".bwt");
+		if ((fp = fopen(prefix, "rb")) == 0) {
+			free(prefix);
+			return 0;
+		} else {
+			fclose(fp);
+			prefix[l_hint] = 0;
+			return prefix;
+		}
+	}
+}
+
+int bwa_aln(int argc, char *argv[])
+{
+	int c, opte = -1;
+	gap_opt_t *opt;
+	char *prefix;
+
+	opt = gap_init_opt();
+	while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
+		switch (c) {
+		case 'n':
+			if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
+			else opt->max_diff = atoi(optarg), opt->fnr = -1.0;
+			break;
+		case 'o': opt->max_gapo = atoi(optarg); break;
+		case 'e': opte = atoi(optarg); break;
+		case 'M': opt->s_mm = atoi(optarg); break;
+		case 'O': opt->s_gapo = atoi(optarg); break;
+		case 'E': opt->s_gape = atoi(optarg); break;
+		case 'd': opt->max_del_occ = atoi(optarg); break;
+		case 'i': opt->indel_end_skip = atoi(optarg); break;
+		case 'l': opt->seed_len = atoi(optarg); break;
+		case 'k': opt->max_seed_diff = atoi(optarg); break;
+		case 'm': opt->max_entries = atoi(optarg); break;
+		case 't': opt->n_threads = atoi(optarg); break;
+		case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
+		case 'R': opt->max_top2 = atoi(optarg); break;
+		case 'q': opt->trim_qual = atoi(optarg); break;
+		case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
+		case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
+		case 'f': xreopen(optarg, "wb", stdout); break;
+		case 'b': opt->mode |= BWA_MODE_BAM; break;
+		case '0': opt->mode |= BWA_MODE_BAM_SE; break;
+		case '1': opt->mode |= BWA_MODE_BAM_READ1; break;
+		case '2': opt->mode |= BWA_MODE_BAM_READ2; break;
+		case 'I': opt->mode |= BWA_MODE_IL13; break;
+		case 'Y': opt->mode |= BWA_MODE_CFY; break;
+		case 'B': opt->mode |= atoi(optarg) << 24; break;
+		default: return 1;
+		}
+	}
+	if (opte > 0) {
+		opt->max_gape = opte;
+		opt->mode &= ~BWA_MODE_GAPE;
+	}
+
+	if (optind + 2 > argc) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   bwa aln [options] <prefix> <in.fq>\n\n");
+		fprintf(stderr, "Options: -n NUM    max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
+				BWA_AVG_ERR, opt->fnr);
+		fprintf(stderr, "         -o INT    maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
+		fprintf(stderr, "         -e INT    maximum number of gap extensions, -1 for disabling long gaps [-1]\n");
+		fprintf(stderr, "         -i INT    do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip);
+		fprintf(stderr, "         -d INT    maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ);
+		fprintf(stderr, "         -l INT    seed length [%d]\n", opt->seed_len);
+		fprintf(stderr, "         -k INT    maximum differences in the seed [%d]\n", opt->max_seed_diff);
+		fprintf(stderr, "         -m INT    maximum entries in the queue [%d]\n", opt->max_entries);
+		fprintf(stderr, "         -t INT    number of threads [%d]\n", opt->n_threads);
+		fprintf(stderr, "         -M INT    mismatch penalty [%d]\n", opt->s_mm);
+		fprintf(stderr, "         -O INT    gap open penalty [%d]\n", opt->s_gapo);
+		fprintf(stderr, "         -E INT    gap extension penalty [%d]\n", opt->s_gape);
+		fprintf(stderr, "         -R INT    stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
+		fprintf(stderr, "         -q INT    quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
+        fprintf(stderr, "         -f FILE   file to write output to instead of stdout\n");
+		fprintf(stderr, "         -B INT    length of barcode\n");
+//		fprintf(stderr, "         -c        input sequences are in the color space\n");
+		fprintf(stderr, "         -L        log-scaled gap penalty for long deletions\n");
+		fprintf(stderr, "         -N        non-iterative mode: search for all n-difference hits (slooow)\n");
+		fprintf(stderr, "         -I        the input is in the Illumina 1.3+ FASTQ-like format\n");
+		fprintf(stderr, "         -b        the input read file is in the BAM format\n");
+		fprintf(stderr, "         -0        use single-end reads only (effective with -b)\n");
+		fprintf(stderr, "         -1        use the 1st read in a pair (effective with -b)\n");
+		fprintf(stderr, "         -2        use the 2nd read in a pair (effective with -b)\n");
+		fprintf(stderr, "         -Y        filter Casava-filtered sequences\n");
+		fprintf(stderr, "\n");
+		return 1;
+	}
+	if (opt->fnr > 0.0) {
+		int i, k;
+		for (i = 17, k = 0; i <= 250; ++i) {
+			int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
+			if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l);
+			k = l;
+		}
+	}
+	if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
+		fprintf(stderr, "[%s] fail to locate the index\n", __func__);
+		free(opt);
+		return 0;
+	}
+	bwa_aln_core(prefix, argv[optind+1], opt);
+	free(opt); free(prefix);
+	return 0;
+}
+
+/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
+__cigar_op and __cigar_len while keeping stdaln stand alone */
+bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
+{
+	uint32_t *cigar32;
+	bwa_cigar_t *cigar;
+	int i;
+	cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar);
+	cigar = (bwa_cigar_t*)cigar32;
+	for (i = 0; i < *n_cigar; ++i)
+                cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) );
+	return cigar;
+}
+