annotate bwa-0.6.2/simple_dp.c @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
1 #include <stdlib.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
2 #include <stdio.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
3 #include <unistd.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
4 #include <string.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
5 #include <zlib.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
6 #include <stdint.h>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
7 #include "stdaln.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
8 #include "utils.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
9
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
10 #include "kseq.h"
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
11 KSEQ_INIT(gzFile, gzread)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
12
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
13 typedef struct {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
14 int l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
15 unsigned char *s;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
16 char *n;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
17 } seq1_t;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
18
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
19 typedef struct {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
20 int n_seqs, m_seqs;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
21 seq1_t *seqs;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
22 } seqs_t;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
23
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
24 unsigned char aln_rev_table[256] = {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
25 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
26 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
27 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
28 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
29 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
30 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
31 'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
32 'N','N','y','s', 'a','N','b','w', 'x','r','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
33 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
34 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
35 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
36 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
37 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
38 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
39 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
40 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
41 };
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
42
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
43 static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
44 static AlnParam g_aln_param;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
45
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
46 static void revseq(int len, uint8_t *seq)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
47 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
48 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
49 for (i = 0; i < len>>1; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
50 uint8_t tmp = aln_rev_table[seq[len-1-i]];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
51 seq[len-1-i] = aln_rev_table[seq[i]];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
52 seq[i] = tmp;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
53 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
54 if (len&1) seq[i] = aln_rev_table[seq[i]];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
55 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
56
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
57 static seqs_t *load_seqs(const char *fn)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
58 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
59 seqs_t *s;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
60 seq1_t *p;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
61 gzFile fp;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
62 int l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
63 kseq_t *seq;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
64
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
65 fp = xzopen(fn, "r");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
66 seq = kseq_init(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
67 s = (seqs_t*)calloc(1, sizeof(seqs_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
68 s->m_seqs = 256;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
69 s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
70 while ((l = kseq_read(seq)) >= 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
71 if (s->n_seqs == s->m_seqs) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
72 s->m_seqs <<= 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
73 s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t));
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
74 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
75 p = s->seqs + (s->n_seqs++);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
76 p->l = seq->seq.l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
77 p->s = (unsigned char*)malloc(p->l + 1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
78 memcpy(p->s, seq->seq.s, p->l);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
79 p->s[p->l] = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
80 p->n = strdup((const char*)seq->name.s);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
81 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
82 kseq_destroy(seq);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
83 gzclose(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
84 fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
85 return s;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
86 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
87
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
88 static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
89 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
90 int i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
91 for (i = 0; i < ss->n_seqs; ++i) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
92 AlnAln *aa;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
93 seq1_t *p = ss->seqs + i;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
94 g_aln_param.band_width = l + p->l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
95 aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
96 if (aa->score >= g_thres || g_is_global) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
97 printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand,
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
98 aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
99 // NB: I put the short sequence as the first sequence in SW, an insertion to
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
100 // the reference becomes a deletion from the short sequence. Therefore, I use
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
101 // "MDI" here rather than "MID", and print ->out2 first rather than ->out1.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
102 for (i = 0; i != aa->n_cigar; ++i)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
103 printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
104 printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
105 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
106 aln_free_AlnAln(aa);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
107 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
108 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
109
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
110 static void aln_seqs(const seqs_t *ss, const char *fn)
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
111 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
112 gzFile fp;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
113 kseq_t *seq;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
114 int l;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
115
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
116 fp = xzopen(fn, "r");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
117 seq = kseq_init(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
118 while ((l = kseq_read(seq)) >= 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
119 if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+');
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
120 if (g_strand&2) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
121 revseq(l, (uint8_t*)seq->seq.s);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
122 aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-');
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
123 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
124 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
125 kseq_destroy(seq);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
126 gzclose(fp);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
127 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
128
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
129 int bwa_stdsw(int argc, char *argv[])
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
130 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
131 int c;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
132 seqs_t *ss;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
133
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
134 while ((c = getopt(argc, argv, "gT:frp")) >= 0) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
135 switch (c) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
136 case 'g': g_is_global = 1; break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
137 case 'T': g_thres = atoi(optarg); break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
138 case 'f': g_strand |= 1; break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
139 case 'r': g_strand |= 2; break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
140 case 'p': g_aa = 1; break;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
141 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
142 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
143 if (g_strand == 0) g_strand = 3;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
144 if (g_aa) g_strand = 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
145 if (optind + 1 >= argc) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
146 fprintf(stderr, "\nUsage: bwa stdsw [options] <seq1.long.fa> <seq2.short.fa>\n\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
147 fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
148 fprintf(stderr, " -p protein alignment (suppressing -r)\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
149 fprintf(stderr, " -f forward strand only\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
150 fprintf(stderr, " -r reverse strand only\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
151 fprintf(stderr, " -g global alignment\n\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
152 fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
153 fprintf(stderr, " sequences and ONE long sequence. It outputs the suboptimal score on the long\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
154 fprintf(stderr, " sequence.\n\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
155 return 1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
156 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
157 g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
158 g_aln_param.gap_end = 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
159 ss = load_seqs(argv[optind]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
160 aln_seqs(ss, argv[optind+1]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
161 return 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
162 }