diff bwa-0.6.2/bntseq.c @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bwa-0.6.2/bntseq.c	Fri Jul 18 07:55:59 2014 -0400
@@ -0,0 +1,323 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <unistd.h>
+#include "bntseq.h"
+#include "main.h"
+#include "utils.h"
+
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+unsigned char nst_nt4_table[256] = {
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 5 /*'-'*/, 4, 4,
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
+};
+
+void bns_dump(const bntseq_t *bns, const char *prefix)
+{
+	char str[1024];
+	FILE *fp;
+	int i;
+	{ // dump .ann
+		strcpy(str, prefix); strcat(str, ".ann");
+		fp = xopen(str, "w");
+		fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
+		for (i = 0; i != bns->n_seqs; ++i) {
+			bntann1_t *p = bns->anns + i;
+			fprintf(fp, "%d %s", p->gi, p->name);
+			if (p->anno[0]) fprintf(fp, " %s\n", p->anno);
+			else fprintf(fp, "\n");
+			fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
+		}
+		fclose(fp);
+	}
+	{ // dump .amb
+		strcpy(str, prefix); strcat(str, ".amb");
+		fp = xopen(str, "w");
+		fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
+		for (i = 0; i != bns->n_holes; ++i) {
+			bntamb1_t *p = bns->ambs + i;
+			fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
+		}
+		fclose(fp);
+	}
+}
+
+bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
+{
+	char str[1024];
+	FILE *fp;
+	bntseq_t *bns;
+	long long xx;
+	int i;
+	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
+	{ // read .ann
+		fp = xopen(ann_filename, "r");
+		fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
+		bns->l_pac = xx;
+		bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
+		for (i = 0; i < bns->n_seqs; ++i) {
+			bntann1_t *p = bns->anns + i;
+			char *q = str;
+			int c;
+			// read gi and sequence name
+			fscanf(fp, "%u%s", &p->gi, str);
+			p->name = strdup(str);
+			// read fasta comments 
+			while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
+			*q = 0;
+			if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
+			else p->anno = strdup("");
+			// read the rest
+			fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
+			p->offset = xx;
+		}
+		fclose(fp);
+	}
+	{ // read .amb
+		int64_t l_pac;
+		int32_t n_seqs;
+		fp = xopen(amb_filename, "r");
+		fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
+		l_pac = xx;
+		xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
+		bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
+		for (i = 0; i < bns->n_holes; ++i) {
+			bntamb1_t *p = bns->ambs + i;
+			fscanf(fp, "%lld%d%s", &xx, &p->len, str);
+			p->offset = xx;
+			p->amb = str[0];
+		}
+		fclose(fp);
+	}
+	{ // open .pac
+		bns->fp_pac = xopen(pac_filename, "rb");
+	}
+	return bns;
+}
+
+bntseq_t *bns_restore(const char *prefix)
+{  
+	char ann_filename[1024], amb_filename[1024], pac_filename[1024];
+	strcat(strcpy(ann_filename, prefix), ".ann");
+	strcat(strcpy(amb_filename, prefix), ".amb");
+	strcat(strcpy(pac_filename, prefix), ".pac");
+	return bns_restore_core(ann_filename, amb_filename, pac_filename);
+}
+
+void bns_destroy(bntseq_t *bns)
+{
+	if (bns == 0) return;
+	else {
+		int i;
+		if (bns->fp_pac) fclose(bns->fp_pac);
+		free(bns->ambs);
+		for (i = 0; i < bns->n_seqs; ++i) {
+			free(bns->anns[i].name);
+			free(bns->anns[i].anno);
+		}
+		free(bns->anns);
+		free(bns);
+	}
+}
+
+#define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1))
+#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3)
+
+static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
+{
+	bntann1_t *p;
+	int i, lasts;
+	if (bns->n_seqs == *m_seqs) {
+		*m_seqs <<= 1;
+		bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
+	}
+	p = bns->anns + bns->n_seqs;
+	p->name = strdup((char*)seq->name.s);
+	p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
+	p->gi = 0; p->len = seq->seq.l;
+	p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
+	p->n_ambs = 0;
+	for (i = lasts = 0; i < seq->seq.l; ++i) {
+		int c = nst_nt4_table[(int)seq->seq.s[i]];
+		if (c >= 4) { // N
+			if (lasts == seq->seq.s[i]) { // contiguous N
+				++(*q)->len;
+			} else {
+				if (bns->n_holes == *m_holes) {
+					(*m_holes) <<= 1;
+					bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
+				}
+				*q = bns->ambs + bns->n_holes;
+				(*q)->len = 1;
+				(*q)->offset = p->offset + i;
+				(*q)->amb = seq->seq.s[i];
+				++p->n_ambs;
+				++bns->n_holes;
+			}
+		}
+		lasts = seq->seq.s[i];
+		{ // fill buffer
+			if (c >= 4) c = lrand48()&3;
+			if (bns->l_pac == *m_pac) { // double the pac size
+				*m_pac <<= 1;
+				pac = realloc(pac, *m_pac/4);
+				memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4);
+			}
+			_set_pac(pac, bns->l_pac, c);
+			++bns->l_pac;
+		}
+	}
+	++bns->n_seqs;
+	return pac;
+}
+
+int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
+{
+	extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
+	kseq_t *seq;
+	char name[1024];
+	bntseq_t *bns;
+	uint8_t *pac = 0;
+	int32_t m_seqs, m_holes;
+	int64_t ret = -1, m_pac, l;
+	bntamb1_t *q;
+	FILE *fp;
+
+	// initialization
+	seq = kseq_init(fp_fa);
+	bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
+	bns->seed = 11; // fixed seed for random generator
+	srand48(bns->seed);
+	m_seqs = m_holes = 8; m_pac = 0x10000;
+	bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
+	bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
+	pac = calloc(m_pac/4, 1);
+	q = bns->ambs;
+	strcpy(name, prefix); strcat(name, ".pac");
+	fp = xopen(name, "wb");
+	// read sequences
+	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
+	if (!for_only) { // add the reverse complemented sequence
+		m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
+		pac = realloc(pac, m_pac/4);
+		memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
+		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
+			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
+	}
+	ret = bns->l_pac;
+	{ // finalize .pac file
+		ubyte_t ct;
+		fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
+		// the following codes make the pac file size always (l_pac/4+1+1)
+		if (bns->l_pac % 4 == 0) {
+			ct = 0;
+			fwrite(&ct, 1, 1, fp);
+		}
+		ct = bns->l_pac % 4;
+		fwrite(&ct, 1, 1, fp);
+		// close .pac file
+		fclose(fp);
+	}
+	bns_dump(bns, prefix);
+	bns_destroy(bns);
+	kseq_destroy(seq);
+	free(pac);
+	return ret;
+}
+
+int bwa_fa2pac(int argc, char *argv[])
+{
+	int c, for_only = 0;
+	gzFile fp;
+	while ((c = getopt(argc, argv, "f")) >= 0) {
+		switch (c) {
+			case 'f': for_only = 1; break;
+		}
+	}
+	if (argc == optind) {
+		fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
+		return 1;
+	}
+	fp = xzopen(argv[optind], "r");
+	bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only);
+	gzclose(fp);
+	return 0;
+}
+
+int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
+{
+	int left, mid, right, nn;
+	if (ref_id) {
+		left = 0; mid = 0; right = bns->n_seqs;
+		while (left < right) {
+			mid = (left + right) >> 1;
+			if (pos_f >= bns->anns[mid].offset) {
+				if (mid == bns->n_seqs - 1) break;
+				if (pos_f < bns->anns[mid+1].offset) break; // bracketed
+				left = mid + 1;
+			} else right = mid;
+		}
+		*ref_id = mid;
+	}
+	left = 0; right = bns->n_holes; nn = 0;
+	while (left < right) {
+		mid = (left + right) >> 1;
+		if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
+		else if (pos_f + len <= bns->ambs[mid].offset) right = mid;
+		else { // overlap
+			if (pos_f >= bns->ambs[mid].offset) {
+				nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
+					bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len;
+			} else {
+				nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
+					bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f);
+			}
+			break;
+		}
+	}
+	return nn;
+}