comparison rDiff/mex/get_reads_direct.cpp @ 0:0f80a5141704

version 0.3 uploaded
author vipints
date Thu, 14 Feb 2013 23:38:36 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0f80a5141704
1 /* written by Jonas Behr, Regina Bohnert and Gunnar Raetsch, FML Tuebingen, Germany, 2010 */
2
3 #include <stdio.h>
4 #include <assert.h>
5 #include "sam.h"
6 #include "get_reads_direct.h"
7
8 #include <vector>
9 using std::vector;
10 #include <string>
11 using std::string;
12
13 #ifdef __cplusplus
14 extern "C" {
15 #endif
16
17 typedef struct {
18 int beg, end;
19 samfile_t *in;
20 } tmpstruct_t;
21
22 typedef struct {
23 uint64_t u, v;
24 } pair64_t;
25
26 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
27 {
28 uint32_t rbeg = b->core.pos;
29 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
30 return (rend > beg && rbeg < end);
31 }
32
33 pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off);
34
35 int bam_fetch_reads(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_header_t* header, vector<CRead*>* reads, char strand);
36
37 // callback for bam_plbuf_init()
38 static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
39 {
40 //tmpstruct_t *tmp = (tmpstruct_t*)data;
41 //if ((int)pos >= tmp->beg && (int)pos < tmp->end)
42 // printf("%s\t%d\t%d\n", tmp->in->header->target_name[tid], pos + 1, n);
43 return 0;
44 }
45 #ifdef __cplusplus
46 }
47 #endif
48 int parse_sam_line(char* line, CRead* read);
49 int set_strand(char c);
50 //void parse_cigar(bam1_t* b, CRead* read);
51 void parse_cigar(bam1_t* b, CRead* read, bam_header_t* header);
52
53
54 int get_reads_from_bam(char* filename, char* region, vector<CRead*>* reads, char strand, int lsubsample)
55 {
56 subsample = lsubsample;
57 set_strand(strand);
58
59 srand (time(NULL));
60 //srand (1234);
61 tmpstruct_t tmp;
62 tmp.in = samopen(filename, "rb", 0);
63 if (tmp.in == 0) {
64 fprintf(stderr, "Fail to open BAM file %s\n", filename);
65 return 1;
66 }
67 int ref;
68 bam_index_t *idx;
69 bam_plbuf_t *buf;
70 idx = bam_index_load(filename); // load BAM index
71 if (idx == 0) {
72 fprintf(stderr, "BAM indexing file is not available.\n");
73 return 1;
74 }
75 bam_parse_region(tmp.in->header, region, &ref,
76 &tmp.beg, &tmp.end); // parse the region
77 if (ref < 0) {
78 fprintf(stderr, "Invalid region %s\n", region);
79 return 1;
80 }
81
82 buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup
83
84 bam_fetch_reads(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, tmp.in->header, reads, strand);
85 //fprintf(stdout, "intron_list: %d \n", intron_list->size());
86
87 bam_plbuf_push(0, buf); // finalize pileup
88 bam_index_destroy(idx);
89 bam_plbuf_destroy(buf);
90 samclose(tmp.in);
91 return 0;
92 }
93
94
95 int bam_fetch_reads(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_header_t* header, vector<CRead*>* reads, char strand)
96 {
97 int n_off;
98 pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off);
99 if (off == 0) return 0;
100 {
101 // retrive alignments
102 uint64_t curr_off;
103 int i, ret, n_seeks;
104 n_seeks = 0; i = -1; curr_off = 0;
105 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
106 for (;;) {
107 if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
108 if (i == n_off - 1) break; // no more chunks
109 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
110 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
111 bam_seek(fp, off[i+1].u, SEEK_SET);
112 curr_off = bam_tell(fp);
113 ++n_seeks;
114 }
115 ++i;
116 }
117 if ((ret = bam_read1(fp, b)) > 0) {
118 curr_off = bam_tell(fp);
119 if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
120 else if (is_overlap(beg, end, b))
121 {
122 int rr = rand();
123 if ((rr%1000 < subsample))
124 {
125 CRead* read = new CRead();
126 parse_cigar(b, read, header);
127
128 if (strand == '0' || strand==read->strand[0])
129 {
130 reads->push_back(read);
131 }
132 else if (read->strand[0]=='0')
133 {
134 reads->push_back(read);
135 }
136 //else if (read->strand[0]=='0'&&((b->core.flag & g_flag_off) >0))
137 //{
138 // //fprintf(stdout, "(-)-strand; read->strand[0]==0, num_exons: %i \n", read->block_starts.size());
139 // // this flag means that the read has been reversed for alignment
140 // // flag bit set and (-)-strand requested
141 // reads->push_back(read);
142 //}
143 //else if (read->strand[0]=='0'&&(g_flag_on>0&&(b->core.flag & g_flag_on)==0))
144 //{
145 // //fprintf(stdout, "(+)-strand; read->strand[0]==0, num_exons: %i \n", read->block_starts.size());
146 // // (+)-strand requested and flag bit not set
147 // reads->push_back(read);
148 //}
149 }
150 }
151 } else break; // end of file
152 }
153 // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);
154 bam_destroy1(b);
155 }
156 free(off);
157 return 0;
158 }
159
160 void parse_cigar(bam1_t* b, CRead* read, bam_header_t* header)
161 {
162 read->start_pos = b->core.pos+1;
163 read->set_strand('0');
164
165 for (int k = 0; k < b->core.n_cigar; ++k)
166 {
167 int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
168 int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
169 //fprintf(stdout, "op:%d l:%d\n", op, l);
170 if (op == BAM_CMATCH)
171 {
172 if (k==0)
173 {
174 read->block_lengths.push_back(l);
175 read->block_starts.push_back(0);
176 }
177 else
178 {
179 int op_prev = bam1_cigar(b)[k-1] & BAM_CIGAR_MASK;
180 int l_prev = bam1_cigar(b)[k-1] >> BAM_CIGAR_SHIFT;
181 if (op_prev==BAM_CREF_SKIP)// intron before
182 {
183 if (read->block_lengths.size()>=1)
184 {
185 int last_block_start = (*(read->block_starts.end()-1));
186 int intron_start = last_block_start+(*(read->block_lengths.end()-1));
187 read->block_lengths.push_back(l);
188 read->block_starts.push_back(intron_start+l_prev);
189 }
190 else
191 {
192 // start of first block was not a match
193 read->block_lengths.push_back(l);
194 read->block_starts.push_back(0);
195 }
196 }
197 else
198 {
199 if (read->block_lengths.size()>=1 && op == BAM_CDEL)// if it is an insertion then the matching block is not inreased
200 (*(read->block_lengths.end()-1))+=l;
201 else
202 {
203 //char *samline = bam_format1(header, b);
204 //printf("header: %s \n", samline);
205 }
206 }
207 }
208 }
209 else if (op == BAM_CDEL)
210 {
211 if (k>0 && read->block_lengths.size()>=1)
212 (*(read->block_lengths.end()-1))+=l;
213 }
214 else if (op == BAM_CREF_SKIP)//intron
215 {}
216 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP)
217 {}
218 }
219 // parse auxiliary data
220 uint8_t* s = bam1_aux(b);
221 uint8_t* end = b->data + b->data_len;
222 while (s < end)
223 {
224 uint8_t type, key[2];
225 key[0] = s[0]; key[1] = s[1];
226 s += 2; type = *s; ++s;
227 //fprintf(stdout, "\n%c%c:%c\n", key[0], key[1], type);
228 if (type == 'A')
229 {
230 if ( key[0] =='X' && key[1] == 'S')
231 {
232 read->set_strand((char) *s);
233 }
234 ++s;
235 }
236 else if (type == 'C')
237 {
238 if ( key[0] =='H' && key[1] == '0')
239 {
240 uint8_t matches = *s;
241 read->matches = (int) matches;
242 }
243 if ( key[0] =='N' && key[1] == 'M')
244 {
245 uint8_t mismatches = *s;
246 read->mismatches = (int) mismatches;
247 }
248 if ( key[0] =='H' && key[1] == 'I')
249 {
250 uint8_t mai = *s;
251 read->multiple_alignment_index = (int) mai;
252 }
253
254 ++s;
255 }
256 else if (type == 'c') { ++s; }
257 else if (type == 'S') { s += 2; }
258 else if (type == 's') { s += 2; }
259 else if (type == 'I') { s += 4; }
260 else if (type == 'i') { s += 4; }
261 else if (type == 'f') { s += 4; }
262 else if (type == 'd') { s += 8; }
263 else if (type == 'Z') { ++s; }
264 else if (type == 'H') { ++s; }
265 }
266 }
267
268 int set_strand(char c)
269 {
270 if (c=='+')
271 {
272 char* fl = (char*) "0x0010";
273 g_flag_on = strtol(fl, 0, 0);
274 g_flag_off = 0;
275 }
276 else if (c=='-')
277 {
278 char* fl = (char*) "0x0010";
279 g_flag_off = strtol(fl, 0, 0);
280 g_flag_on = 0;
281 }
282 }
283