annotate bwa-0.6.2/bwase.c @ 0:dd1186b11b3b draft

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:14 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
1 #include <unistd.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
2 #include <string.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
3 #include <stdio.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
4 #include <stdlib.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
5 #include <math.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
6 #include <time.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
7 #include "stdaln.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
8 #include "bwase.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
9 #include "bwtaln.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
10 #include "bntseq.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
11 #include "utils.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
12 #include "kstring.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
13
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
14 int g_log_n[256];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
15 char *bwa_rg_line, *bwa_rg_id;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
16
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
17 void bwa_print_sam_PG();
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
18
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
19 void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
20 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
21 int i, cnt, best;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
22 if (n_aln == 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
23 s->type = BWA_TYPE_NO_MATCH;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
24 s->c1 = s->c2 = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
25 return;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
26 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
27
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
28 if (set_main) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
29 best = aln[0].score;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
30 for (i = cnt = 0; i < n_aln; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
31 const bwt_aln1_t *p = aln + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
32 if (p->score > best) break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
33 if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
34 s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
35 s->score = p->score;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
36 s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
37 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
38 cnt += p->l - p->k + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
39 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
40 s->c1 = cnt;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
41 for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
42 s->c2 = cnt - s->c1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
43 s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
44 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
45
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
46 if (n_multi) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
47 int k, rest, n_occ, z = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
48 for (k = n_occ = 0; k < n_aln; ++k) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
49 const bwt_aln1_t *q = aln + k;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
50 n_occ += q->l - q->k + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
51 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
52 if (s->multi) free(s->multi);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
53 if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
54 s->multi = 0; s->n_multi = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
55 return;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
56 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
57 /* The following code is more flexible than what is required
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
58 * here. In principle, due to the requirement above, we can
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
59 * simply output all hits, but the following samples "rest"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
60 * number of random hits. */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
61 rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
62 s->multi = calloc(rest, sizeof(bwt_multi1_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
63 for (k = 0; k < n_aln; ++k) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
64 const bwt_aln1_t *q = aln + k;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
65 if (q->l - q->k + 1 <= rest) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
66 bwtint_t l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
67 for (l = q->k; l <= q->l; ++l) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
68 s->multi[z].pos = l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
69 s->multi[z].gap = q->n_gapo + q->n_gape;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
70 s->multi[z++].mm = q->n_mm;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
71 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
72 rest -= q->l - q->k + 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
73 } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
74 int j, i, k;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
75 for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
76 double p = 1.0, x = drand48();
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
77 while (x < p) p -= p * j / (i--);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
78 s->multi[z].pos = q->l - i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
79 s->multi[z].gap = q->n_gapo + q->n_gape;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
80 s->multi[z++].mm = q->n_mm;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
81 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
82 rest = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
83 break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
84 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
85 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
86 s->n_multi = z;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
87 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
88 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
89
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
90 void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
91 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
92 bwa_aln2seq_core(n_aln, aln, s, 1, 0);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
93 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
94
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
95 int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
96 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
97 int n;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
98 if (p->c1 == 0) return 23;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
99 if (p->c1 > 1) return 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
100 if (p->n_mm == mm) return 25;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
101 if (p->c2 == 0) return 37;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
102 n = (p->c2 >= 255)? 255 : p->c2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
103 return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
104 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
105
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
106 bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
107 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
108 bwtint_t pos_f;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
109 int is_rev;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
110 pos_f = bns_depos(bns, bwt_sa(bwt, sapos), &is_rev); // pos_f
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
111 *strand = !is_rev;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
112 /* NB: For gapped alignment, pacpos may not be correct, which will be fixed
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
113 * in bwa_refine_gapped_core(). This line also determines the way "x" is
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
114 * calculated in bwa_refine_gapped_core() when (ext < 0 && is_end == 0). */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
115 if (is_rev) pos_f = pos_f + 1 < len? 0 : pos_f - len + 1; // mapped to the forward strand
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
116 return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
117 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
118
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
119 /**
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
120 * Derive the actual position in the read from the given suffix array
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
121 * coordinates. Note that the position will be approximate based on
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
122 * whether indels appear in the read and whether calculations are
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
123 * performed from the start or end of the read.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
124 */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
125 void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
126 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
127 int max_diff, strand;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
128 if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
129 max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
130 seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
131 seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len, &strand);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
132 seq->strand = strand;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
133 seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
134 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
135
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
136 void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
137 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
138 int i, j, strand, n_multi;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
139 char str[1024];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
140 bwt_t *bwt;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
141 // load forward SA
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
142 strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
143 strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
144 for (i = 0; i != n_seqs; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
145 bwa_seq_t *p = &seqs[i];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
146 bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
147 for (j = n_multi = 0; j < p->n_multi; ++j) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
148 bwt_multi1_t *q = p->multi + j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
149 q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len, &strand);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
150 q->strand = strand;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
151 if (q->pos != p->pos)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
152 p->multi[n_multi++] = *q;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
153 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
154 p->n_multi = n_multi;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
155 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
156 bwt_destroy(bwt);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
157 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
158
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
159 /* is_end_correct == 1 if (*pos+len) gives the correct coordinate on
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
160 * forward strand. This happens when p->pos is calculated by
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
161 * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
162 * coordinate. This happens only for color-converted alignment. */
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
163 bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
164 int ext, int *n_cigar, int is_end_correct)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
165 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
166 bwa_cigar_t *cigar = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
167 ubyte_t *ref_seq;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
168 int l = 0, path_len, ref_len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
169 AlnParam ap = aln_param_bwa;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
170 path_t *path;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
171 int64_t k, __pos = *_pos;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
172
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
173 ref_len = len + abs(ext);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
174 if (ext > 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
175 ref_seq = (ubyte_t*)calloc(ref_len, 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
176 for (k = __pos; k < __pos + ref_len && k < l_pac; ++k)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
177 ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
178 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
179 int64_t x = __pos + (is_end_correct? len : ref_len);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
180 ref_seq = (ubyte_t*)calloc(ref_len, 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
181 for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
182 ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
183 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
184 path = (path_t*)calloc(l+len, sizeof(path_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
185
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
186 aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
187 cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
188
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
189 if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
190 for (l = k = 0; k < *n_cigar; ++k) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
191 if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
192 else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
193 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
194 __pos += l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
195 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
196
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
197 if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
198 __pos += __cigar_len(cigar[0]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
199 for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
200 --(*n_cigar);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
201 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
202 if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
203
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
204 // change "I" at either end of the read to S. just in case. This should rarely happen...
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
205 if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1])));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
206 if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0])));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
207
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
208 *_pos = (bwtint_t)__pos;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
209 free(ref_seq); free(path);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
210 return cigar;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
211 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
212
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
213 char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
214 bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
215 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
216 bwtint_t x, y;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
217 int z, u, c, nm = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
218 str->l = 0; // reset
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
219 x = pos; y = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
220 if (cigar) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
221 int k, l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
222 for (k = u = 0; k < n_cigar; ++k) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
223 l = __cigar_len(cigar[k]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
224 if (__cigar_op(cigar[k]) == FROM_M) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
225 for (z = 0; z < l && x+z < l_pac; ++z) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
226 c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
227 if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
228 ksprintf(str, "%d", u);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
229 kputc("ACGTN"[c], str);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
230 ++nm;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
231 u = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
232 } else ++u;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
233 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
234 x += l; y += l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
235 } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
236 y += l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
237 if (__cigar_op(cigar[k]) == FROM_I) nm += l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
238 } else if (__cigar_op(cigar[k]) == FROM_D) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
239 ksprintf(str, "%d", u);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
240 kputc('^', str);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
241 for (z = 0; z < l && x+z < l_pac; ++z)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
242 kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
243 u = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
244 x += l; nm += l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
245 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
246 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
247 } else { // no gaps
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
248 for (z = u = 0; z < (bwtint_t)len && x+z < l_pac; ++z) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
249 c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
250 if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
251 ksprintf(str, "%d", u);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
252 kputc("ACGTN"[c], str);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
253 ++nm;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
254 u = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
255 } else ++u;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
256 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
257 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
258 ksprintf(str, "%d", u);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
259 *_nm = nm;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
260 return strdup(str->s);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
261 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
262
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
263 void bwa_correct_trimmed(bwa_seq_t *s)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
264 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
265 if (s->len == s->full_len) return;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
266 if (s->strand == 0) { // forward
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
267 if (s->cigar && __cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
268 s->cigar[s->n_cigar-1] += s->full_len - s->len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
269 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
270 if (s->cigar == 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
271 s->n_cigar = 2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
272 s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
273 s->cigar[0] = __cigar_create(0, s->len);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
274 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
275 ++s->n_cigar;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
276 s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
277 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
278 s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
279 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
280 } else { // reverse
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
281 if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) { // the first is S
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
282 s->cigar[0] += s->full_len - s->len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
283 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
284 if (s->cigar == 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
285 s->n_cigar = 2;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
286 s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
287 s->cigar[1] = __cigar_create(0, s->len);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
288 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
289 ++s->n_cigar;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
290 s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
291 memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
292 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
293 s->cigar[0] = __cigar_create(3, (s->full_len - s->len));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
294 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
295 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
296 s->len = s->full_len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
297 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
298
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
299 void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
300 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
301 ubyte_t *pacseq, *ntpac = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
302 int i, j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
303 kstring_t *str;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
304
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
305 if (ntbns) { // in color space
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
306 ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
307 rewind(ntbns->fp_pac);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
308 fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
309 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
310
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
311 if (!_pacseq) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
312 pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
313 rewind(bns->fp_pac);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
314 fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
315 } else pacseq = _pacseq;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
316 for (i = 0; i != n_seqs; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
317 bwa_seq_t *s = seqs + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
318 seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
319 for (j = 0; j < s->n_multi; ++j) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
320 bwt_multi1_t *q = s->multi + j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
321 int n_cigar;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
322 if (q->gap == 0) continue;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
323 q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
324 (q->strand? 1 : -1) * q->gap, &n_cigar, 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
325 q->n_cigar = n_cigar;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
326 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
327 if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
328 s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
329 (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
330 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
331 #if 0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
332 if (ntbns) { // in color space
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
333 for (i = 0; i < n_seqs; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
334 bwa_seq_t *s = seqs + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
335 bwa_cs2nt_core(s, bns->l_pac, ntpac);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
336 for (j = 0; j < s->n_multi; ++j) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
337 bwt_multi1_t *q = s->multi + j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
338 int n_cigar;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
339 if (q->gap == 0) continue;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
340 free(q->cigar);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
341 q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
342 (q->strand? 1 : -1) * q->gap, &n_cigar, 0);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
343 q->n_cigar = n_cigar;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
344 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
345 if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
346 free(s->cigar);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
347 s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
348 (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
349 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
350 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
351 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
352 #endif
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
353 // generate MD tag
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
354 str = (kstring_t*)calloc(1, sizeof(kstring_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
355 for (i = 0; i != n_seqs; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
356 bwa_seq_t *s = seqs + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
357 if (s->type != BWA_TYPE_NO_MATCH) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
358 int nm;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
359 s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
360 bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
361 s->nm = nm;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
362 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
363 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
364 free(str->s); free(str);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
365
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
366 // correct for trimmed reads
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
367 if (!ntbns) // trimming is only enabled for Illumina reads
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
368 for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
369
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
370 if (!_pacseq) free(pacseq);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
371 free(ntpac);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
372 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
373
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
374 int64_t pos_end(const bwa_seq_t *p)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
375 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
376 if (p->cigar) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
377 int j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
378 int64_t x = p->pos;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
379 for (j = 0; j != p->n_cigar; ++j) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
380 int op = __cigar_op(p->cigar[j]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
381 if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
382 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
383 return x;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
384 } else return p->pos + p->len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
385 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
386
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
387 int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end()
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
388 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
389 if (p->cigar) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
390 int j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
391 int64_t x = p->pos;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
392 for (j = 0; j != p->n_cigar; ++j) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
393 int op = __cigar_op(p->cigar[j]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
394 if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
395 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
396 return x;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
397 } else return p->pos + len;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
398 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
399
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
400 static int64_t pos_5(const bwa_seq_t *p)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
401 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
402 if (p->type != BWA_TYPE_NO_MATCH)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
403 return p->strand? pos_end(p) : p->pos;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
404 return -1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
405 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
406
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
407 void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
408 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
409 int j;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
410 if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
411 int seqid, nn, am = 0, flag = p->extra_flag;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
412 char XT;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
413
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
414 if (p->type == BWA_TYPE_NO_MATCH) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
415 p->pos = mate->pos;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
416 p->strand = mate->strand;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
417 flag |= SAM_FSU;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
418 j = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
419 } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
420
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
421 // get seqid
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
422 nn = bns_cnt_ambi(bns, p->pos, j, &seqid);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
423 if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
424 flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
425
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
426 // update flag and print it
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
427 if (p->strand) flag |= SAM_FSR;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
428 if (mate) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
429 if (mate->type != BWA_TYPE_NO_MATCH) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
430 if (mate->strand) flag |= SAM_FMR;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
431 } else flag |= SAM_FMU;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
432 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
433 err_printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
434 err_printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
435
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
436 // print CIGAR
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
437 if (p->cigar) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
438 for (j = 0; j != p->n_cigar; ++j)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
439 err_printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
440 } else if (p->type == BWA_TYPE_NO_MATCH) err_printf("*");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
441 else err_printf("%dM", p->len);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
442
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
443 // print mate coordinate
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
444 if (mate && mate->type != BWA_TYPE_NO_MATCH) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
445 int m_seqid, m_is_N;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
446 long long isize;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
447 am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
448 // redundant calculation here, but should not matter too much
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
449 m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
450 err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
451 isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
452 if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
453 err_printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
454 } else if (mate) err_printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
455 else err_printf("\t*\t0\t0\t");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
456
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
457 // print sequence and quality
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
458 if (p->strand == 0)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
459 for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
460 else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
461 putchar('\t');
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
462 if (p->qual) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
463 if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
464 err_printf("%s", p->qual);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
465 } else err_printf("*");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
466
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
467 if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
468 if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
469 if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
470 if (p->type != BWA_TYPE_NO_MATCH) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
471 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
472 // calculate XT tag
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
473 XT = "NURM"[p->type];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
474 if (nn > 10) XT = 'N';
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
475 // print tags
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
476 err_printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
477 if (nn) err_printf("\tXN:i:%d", nn);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
478 if (mate) err_printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
479 if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
480 err_printf("\tX0:i:%d", p->c1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
481 if (p->c1 <= max_top2) err_printf("\tX1:i:%d", p->c2);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
482 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
483 err_printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
484 if (p->md) err_printf("\tMD:Z:%s", p->md);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
485 // print multiple hits
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
486 if (p->n_multi) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
487 err_printf("\tXA:Z:");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
488 for (i = 0; i < p->n_multi; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
489 bwt_multi1_t *q = p->multi + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
490 int k;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
491 j = pos_end_multi(q, p->len) - q->pos;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
492 nn = bns_cnt_ambi(bns, q->pos, j, &seqid);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
493 err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
494 (int)(q->pos - bns->anns[seqid].offset + 1));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
495 if (q->cigar) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
496 for (k = 0; k < q->n_cigar; ++k)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
497 err_printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
498 } else err_printf("%dM", p->len);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
499 err_printf(",%d;", q->gap + q->mm);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
500 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
501 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
502 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
503 putchar('\n');
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
504 } else { // this read has no match
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
505 ubyte_t *s = p->strand? p->rseq : p->seq;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
506 int flag = p->extra_flag | SAM_FSU;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
507 if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
508 err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
509 for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
510 putchar('\t');
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
511 if (p->qual) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
512 if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
513 err_printf("%s", p->qual);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
514 } else err_printf("*");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
515 if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
516 if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
517 if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
518 putchar('\n');
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
519 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
520 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
521
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
522 bntseq_t *bwa_open_nt(const char *prefix)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
523 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
524 bntseq_t *ntbns;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
525 char *str;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
526 str = (char*)calloc(strlen(prefix) + 10, 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
527 strcat(strcpy(str, prefix), ".nt");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
528 ntbns = bns_restore(str);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
529 free(str);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
530 return ntbns;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
531 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
532
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
533 void bwa_print_sam_SQ(const bntseq_t *bns)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
534 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
535 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
536 for (i = 0; i < bns->n_seqs; ++i)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
537 err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
538 if (bwa_rg_line) err_printf("%s\n", bwa_rg_line);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
539 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
540
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
541 void bwase_initialize()
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
542 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
543 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
544 for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
545 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
546
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
547 char *bwa_escape(char *s)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
548 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
549 char *p, *q;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
550 for (p = q = s; *p; ++p) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
551 if (*p == '\\') {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
552 ++p;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
553 if (*p == 't') *q++ = '\t';
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
554 else if (*p == 'n') *q++ = '\n';
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
555 else if (*p == 'r') *q++ = '\r';
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
556 else if (*p == '\\') *q++ = '\\';
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
557 } else *q++ = *p;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
558 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
559 *q = '\0';
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
560 return s;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
561 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
562
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
563 int bwa_set_rg(const char *s)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
564 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
565 char *p, *q, *r;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
566 if (strstr(s, "@RG") != s) return -1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
567 if (bwa_rg_line) free(bwa_rg_line);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
568 if (bwa_rg_id) free(bwa_rg_id);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
569 bwa_rg_line = strdup(s);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
570 bwa_rg_id = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
571 bwa_escape(bwa_rg_line);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
572 p = strstr(bwa_rg_line, "\tID:");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
573 if (p == 0) return -1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
574 p += 4;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
575 for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
576 bwa_rg_id = calloc(q - p + 1, 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
577 for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
578 *r++ = *q;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
579 return 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
580 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
581
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
582 void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
583 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
584 extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
585 int i, n_seqs, tot_seqs = 0, m_aln;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
586 bwt_aln1_t *aln = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
587 bwa_seq_t *seqs;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
588 bwa_seqio_t *ks;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
589 clock_t t;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
590 bntseq_t *bns, *ntbns = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
591 FILE *fp_sa;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
592 gap_opt_t opt;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
593
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
594 // initialization
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
595 bwase_initialize();
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
596 bns = bns_restore(prefix);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
597 srand48(bns->seed);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
598 fp_sa = xopen(fn_sa, "r");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
599
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
600 m_aln = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
601 fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
602 if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
603 ntbns = bwa_open_nt(prefix);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
604 bwa_print_sam_SQ(bns);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
605 //bwa_print_sam_PG();
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
606 // set ks
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
607 ks = bwa_open_reads(opt.mode, fn_fa);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
608 // core loop
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
609 while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
610 tot_seqs += n_seqs;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
611 t = clock();
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
612
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
613 // read alignment
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
614 for (i = 0; i < n_seqs; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
615 bwa_seq_t *p = seqs + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
616 int n_aln;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
617 fread(&n_aln, 4, 1, fp_sa);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
618 if (n_aln > m_aln) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
619 m_aln = n_aln;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
620 aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
621 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
622 fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
623 bwa_aln2seq_core(n_aln, aln, p, 1, n_occ);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
624 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
625
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
626 fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
627 bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
628 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
629
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
630 fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
631 bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
632 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
633
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
634 fprintf(stderr, "[bwa_aln_core] print alignments... ");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
635 for (i = 0; i < n_seqs; ++i)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
636 bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
637 fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
638
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
639 bwa_free_read_seq(n_seqs, seqs);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
640 fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
641 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
642
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
643 // destroy
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
644 bwa_seq_close(ks);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
645 if (ntbns) bns_destroy(ntbns);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
646 bns_destroy(bns);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
647 fclose(fp_sa);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
648 free(aln);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
649 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
650
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
651 int bwa_sai2sam_se(int argc, char *argv[])
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
652 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
653 extern char *bwa_infer_prefix(const char *hint);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
654 int c, n_occ = 3;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
655 char *prefix;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
656 while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
657 switch (c) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
658 case 'h': break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
659 case 'r':
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
660 if (bwa_set_rg(optarg) < 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
661 fprintf(stderr, "[%s] malformated @RG line\n", __func__);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
662 return 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
663 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
664 break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
665 case 'n': n_occ = atoi(optarg); break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
666 case 'f': xreopen(optarg, "w", stdout); break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
667 default: return 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
668 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
669 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
670
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
671 if (optind + 3 > argc) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
672 fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
673 return 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
674 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
675 if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
676 fprintf(stderr, "[%s] fail to locate the index\n", __func__);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
677 free(bwa_rg_line); free(bwa_rg_id);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
678 return 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
679 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
680 bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
681 free(bwa_rg_line); free(bwa_rg_id);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
682 return 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
683 }