annotate bwa-0.6.2/fastmap.c @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
1 #include <zlib.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
2 #include <stdio.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
3 #include <unistd.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
4 #include <stdlib.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
5 #include "bntseq.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
6 #include "bwt.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
7 #include "kvec.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
8 #include "kseq.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
9 KSEQ_INIT(gzFile, gzread)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
10
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
11 extern unsigned char nst_nt4_table[256];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
12
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
13 typedef struct {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
14 const bwt_t *bwt;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
15 const uint8_t *query;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
16 int start, len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
17 bwtintv_v *tmpvec[2], *matches;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
18 } smem_i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
19
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
20 smem_i *smem_iter_init(const bwt_t *bwt)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
21 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
22 smem_i *iter;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
23 iter = calloc(1, sizeof(smem_i));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
24 iter->bwt = bwt;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
25 iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
26 iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
27 iter->matches = calloc(1, sizeof(bwtintv_v));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
28 return iter;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
29 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
30
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
31 void smem_iter_destroy(smem_i *iter)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
32 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
33 free(iter->tmpvec[0]->a);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
34 free(iter->tmpvec[1]->a);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
35 free(iter->matches->a);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
36 free(iter);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
37 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
38
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
39 void smem_set_query(smem_i *iter, int len, const uint8_t *query)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
40 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
41 iter->query = query;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
42 iter->start = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
43 iter->len = len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
44 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
45
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
46 int smem_next(smem_i *iter)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
47 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
48 iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
49 if (iter->start >= iter->len || iter->start < 0) return -1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
50 while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
51 if (iter->start == iter->len) return -1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
52 iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
53 return iter->start;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
54 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
55
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
56 int main_fastmap(int argc, char *argv[])
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
57 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
58 int c, i, min_iwidth = 20, min_len = 17, print_seq = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
59 kseq_t *seq;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
60 bwtint_t k;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
61 gzFile fp;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
62 bwt_t *bwt;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
63 bntseq_t *bns;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
64 smem_i *iter;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
65
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
66 while ((c = getopt(argc, argv, "w:l:s")) >= 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
67 switch (c) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
68 case 's': print_seq = 1; break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
69 case 'w': min_iwidth = atoi(optarg); break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
70 case 'l': min_len = atoi(optarg); break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
71 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
72 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
73 if (optind + 1 >= argc) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
74 fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
75 return 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
76 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
77
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
78 fp = gzopen(argv[optind + 1], "r");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
79 seq = kseq_init(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
80 { // load the packed sequences, BWT and SA
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
81 char *tmp = calloc(strlen(argv[optind]) + 5, 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
82 strcat(strcpy(tmp, argv[optind]), ".bwt");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
83 bwt = bwt_restore_bwt(tmp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
84 strcat(strcpy(tmp, argv[optind]), ".sa");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
85 bwt_restore_sa(tmp, bwt);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
86 free(tmp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
87 bns = bns_restore(argv[optind]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
88 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
89 iter = smem_iter_init(bwt);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
90 while (kseq_read(seq) >= 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
91 printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
92 if (print_seq) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
93 putchar('\t');
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
94 puts(seq->seq.s);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
95 } else putchar('\n');
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
96 for (i = 0; i < seq->seq.l; ++i)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
97 seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
98 smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
99 while (smem_next(iter) > 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
100 for (i = 0; i < iter->matches->n; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
101 bwtintv_t *p = &iter->matches->a[i];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
102 if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
103 printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
104 if (p->x[2] <= min_iwidth) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
105 for (k = 0; k < p->x[2]; ++k) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
106 bwtint_t pos;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
107 int len, is_rev, ref_id;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
108 len = (uint32_t)p->info - (p->info>>32);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
109 pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
110 if (is_rev) pos -= len - 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
111 bns_cnt_ambi(bns, pos, len, &ref_id);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
112 printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
113 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
114 } else fputs("\t*", stdout);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
115 putchar('\n');
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
116 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
117 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
118 puts("//");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
119 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
120
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
121 smem_iter_destroy(iter);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
122 bns_destroy(bns);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
123 bwt_destroy(bwt);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
124 kseq_destroy(seq);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
125 gzclose(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
126 return 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
127 }