Mercurial > repos > vipints > deseq_hts
comparison deseq-hts_1.0/mex/get_reads_direct.cpp @ 0:94a108763d9e draft
deseq-hts version 1.0 wraps the DESeq 1.6.0
author | vipints |
---|---|
date | Wed, 09 May 2012 20:43:47 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:94a108763d9e |
---|---|
1 /* | |
2 * This program is free software; you can redistribute it and/or modify | |
3 * it under the terms of the GNU General Public License as published by | |
4 * the Free Software Foundation; either version 3 of the License, or | |
5 * (at your option) any later version. | |
6 * | |
7 * Written (W) 2010-2011 Jonas Behr, Regina Bohnert, Gunnar Raetsch | |
8 * Copyright (C) 2010-2011 Max Planck Society | |
9 */ | |
10 | |
11 | |
12 #include <stdio.h> | |
13 #include <assert.h> | |
14 #include "sam.h" | |
15 #include "get_reads_direct.h" | |
16 | |
17 #include <vector> | |
18 using std::vector; | |
19 #include <string> | |
20 using std::string; | |
21 | |
22 #ifdef __cplusplus | |
23 extern "C" { | |
24 #endif | |
25 | |
26 typedef struct { | |
27 int beg, end; | |
28 samfile_t *in; | |
29 } tmpstruct_t; | |
30 | |
31 typedef struct { | |
32 uint64_t u, v; | |
33 } pair64_t; | |
34 | |
35 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) | |
36 { | |
37 uint32_t rbeg = b->core.pos; | |
38 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1; | |
39 return (rend > beg && rbeg < end); | |
40 } | |
41 | |
42 pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off); | |
43 | |
44 int bam_fetch_reads(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_header_t* header, vector<CRead*>* reads, char strand); | |
45 | |
46 // callback for bam_plbuf_init() | |
47 static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) | |
48 { | |
49 //tmpstruct_t *tmp = (tmpstruct_t*)data; | |
50 //if ((int)pos >= tmp->beg && (int)pos < tmp->end) | |
51 // printf("%s\t%d\t%d\n", tmp->in->header->target_name[tid], pos + 1, n); | |
52 return 0; | |
53 } | |
54 #ifdef __cplusplus | |
55 } | |
56 #endif | |
57 int parse_sam_line(char* line, CRead* read); | |
58 //int set_strand(char c); | |
59 //void parse_cigar(bam1_t* b, CRead* read); | |
60 void parse_cigar(bam1_t* b, CRead* read, bam_header_t* header); | |
61 | |
62 | |
63 int get_reads_from_bam(char* filename, char* region, vector<CRead*>* reads, char strand, int lsubsample) | |
64 { | |
65 subsample = lsubsample; | |
66 //set_strand(strand); | |
67 | |
68 srand (time(NULL)); | |
69 //srand (1234); | |
70 tmpstruct_t tmp; | |
71 tmp.in = samopen(filename, "rb", 0); | |
72 if (tmp.in == 0) { | |
73 fprintf(stderr, "Fail to open BAM file %s\n", filename); | |
74 return 1; | |
75 } | |
76 int ref; | |
77 bam_index_t *idx; | |
78 bam_plbuf_t *buf; | |
79 idx = bam_index_load(filename); // load BAM index | |
80 if (idx == 0) { | |
81 fprintf(stderr, "BAM indexing file is not available.\n"); | |
82 return 1; | |
83 } | |
84 bam_parse_region(tmp.in->header, region, &ref, | |
85 &tmp.beg, &tmp.end); // parse the region | |
86 if (ref < 0) { | |
87 fprintf(stderr, "Invalid region %s\n", region); | |
88 return 1; | |
89 } | |
90 | |
91 buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup | |
92 | |
93 bam_fetch_reads(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, tmp.in->header, reads, strand); | |
94 //fprintf(stdout, "intron_list: %d \n", intron_list->size()); | |
95 | |
96 bam_plbuf_push(0, buf); // finalize pileup | |
97 bam_index_destroy(idx); | |
98 bam_plbuf_destroy(buf); | |
99 samclose(tmp.in); | |
100 return 0; | |
101 } | |
102 | |
103 | |
104 int bam_fetch_reads(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_header_t* header, vector<CRead*>* reads, char strand) | |
105 { | |
106 int n_off; | |
107 pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off); | |
108 if (off == 0) return 0; | |
109 { | |
110 // retrive alignments | |
111 uint64_t curr_off; | |
112 int i, ret, n_seeks; | |
113 n_seeks = 0; i = -1; curr_off = 0; | |
114 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); | |
115 for (;;) { | |
116 if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk | |
117 if (i == n_off - 1) break; // no more chunks | |
118 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug | |
119 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek | |
120 bam_seek(fp, off[i+1].u, SEEK_SET); | |
121 curr_off = bam_tell(fp); | |
122 ++n_seeks; | |
123 } | |
124 ++i; | |
125 } | |
126 if ((ret = bam_read1(fp, b)) > 0) { | |
127 curr_off = bam_tell(fp); | |
128 if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed | |
129 else if (is_overlap(beg, end, b)) | |
130 { | |
131 int rr = rand(); | |
132 if ((rr%1000 < subsample)) | |
133 { | |
134 CRead* read = new CRead(); | |
135 parse_cigar(b, read, header); | |
136 | |
137 if (strand == '0' || strand==read->strand[0] || read->strand[0]=='0') | |
138 { | |
139 read->left = (b->core.flag & left_flag_mask) >0; | |
140 read->right = (b->core.flag & right_flag_mask) >0; | |
141 read->reverse = (b->core.flag & reverse_flag_mask) >0; | |
142 reads->push_back(read); | |
143 } | |
144 else | |
145 { | |
146 delete read; | |
147 } | |
148 //else if (read->strand[0]=='0'&&((b->core.flag & g_flag_off) >0)) | |
149 //{ | |
150 // //fprintf(stdout, "(-)-strand; read->strand[0]==0, num_exons: %i \n", read->block_starts.size()); | |
151 // // this flag means that the read has been reversed for alignment | |
152 // // flag bit set and (-)-strand requested | |
153 // reads->push_back(read); | |
154 //} | |
155 //else if (read->strand[0]=='0'&&(g_flag_on>0&&(b->core.flag & g_flag_on)==0)) | |
156 //{ | |
157 // //fprintf(stdout, "(+)-strand; read->strand[0]==0, num_exons: %i \n", read->block_starts.size()); | |
158 // // (+)-strand requested and flag bit not set | |
159 // reads->push_back(read); | |
160 //} | |
161 } | |
162 } | |
163 } else break; // end of file | |
164 } | |
165 // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks); | |
166 bam_destroy1(b); | |
167 } | |
168 free(off); | |
169 return 0; | |
170 } | |
171 | |
172 void parse_cigar(bam1_t* b, CRead* read, bam_header_t* header) | |
173 { | |
174 read->start_pos = b->core.pos+1; | |
175 read->set_strand('0'); | |
176 read->read_id = new char[100]; | |
177 sprintf(read->read_id, "%s\0", bam1_qname(b)); | |
178 | |
179 for (int k = 0; k < b->core.n_cigar; ++k) | |
180 { | |
181 int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation | |
182 int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length | |
183 //fprintf(stdout, "op:%d l:%d\n", op, l); | |
184 if (op == BAM_CMATCH) | |
185 { | |
186 if (k==0) | |
187 { | |
188 read->block_lengths.push_back(l); | |
189 read->block_starts.push_back(0); | |
190 } | |
191 else | |
192 { | |
193 int op_prev = bam1_cigar(b)[k-1] & BAM_CIGAR_MASK; | |
194 int l_prev = bam1_cigar(b)[k-1] >> BAM_CIGAR_SHIFT; | |
195 if (op_prev==BAM_CREF_SKIP)// intron before | |
196 { | |
197 if (read->block_lengths.size()>=1) | |
198 { | |
199 int last_block_start = (*(read->block_starts.end()-1)); | |
200 int intron_start = last_block_start+(*(read->block_lengths.end()-1)); | |
201 read->block_lengths.push_back(l); | |
202 read->block_starts.push_back(intron_start+l_prev); | |
203 } | |
204 else | |
205 { | |
206 // start of first block was not a match | |
207 read->block_lengths.push_back(l); | |
208 read->block_starts.push_back(0); | |
209 } | |
210 } | |
211 else | |
212 { | |
213 if (read->block_lengths.size()>=1 && op == BAM_CDEL)// if it is an insertion then the matching block is not inreased | |
214 (*(read->block_lengths.end()-1))+=l; | |
215 else | |
216 { | |
217 //char *samline = bam_format1(header, b); | |
218 //printf("header: %s \n", samline); | |
219 } | |
220 } | |
221 } | |
222 } | |
223 else if (op == BAM_CDEL) | |
224 { | |
225 if (k>0 && read->block_lengths.size()>=1) | |
226 (*(read->block_lengths.end()-1))+=l; | |
227 } | |
228 else if (op == BAM_CREF_SKIP)//intron | |
229 {} | |
230 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) | |
231 {} | |
232 } | |
233 // parse auxiliary data | |
234 uint8_t* s = bam1_aux(b); | |
235 uint8_t* end = b->data + b->data_len; | |
236 while (s < end) | |
237 { | |
238 uint8_t type, key[2]; | |
239 key[0] = s[0]; key[1] = s[1]; | |
240 s += 2; type = *s; ++s; | |
241 //fprintf(stdout, "\n%c%c:%c\n", key[0], key[1], type); | |
242 if (type == 'A') | |
243 { | |
244 if ( key[0] =='X' && key[1] == 'S') | |
245 { | |
246 read->set_strand((char) *s); | |
247 } | |
248 ++s; | |
249 } | |
250 else if (type == 'C') | |
251 { | |
252 if ( key[0] =='H' && key[1] == '0') | |
253 { | |
254 uint8_t matches = *s; | |
255 read->matches = (int) matches; | |
256 } | |
257 if ( key[0] =='N' && key[1] == 'M') | |
258 { | |
259 uint8_t mismatches = *s; | |
260 read->mismatches = (int) mismatches; | |
261 } | |
262 if ( key[0] =='H' && key[1] == 'I') | |
263 { | |
264 uint8_t mai = *s; | |
265 read->multiple_alignment_index = (int) mai; | |
266 } | |
267 | |
268 ++s; | |
269 } | |
270 else if (type == 'c') { ++s; } | |
271 else if (type == 'S') { s += 2; } | |
272 else if (type == 's') { s += 2; } | |
273 else if (type == 'I') { s += 4; } | |
274 else if (type == 'i') { s += 4; } | |
275 else if (type == 'f') { s += 4; } | |
276 else if (type == 'd') { s += 8; } | |
277 else if (type == 'Z') { ++s; } | |
278 else if (type == 'H') { ++s; } | |
279 } | |
280 } | |
281 | |
282 //int set_strand(char c) | |
283 //{ | |
284 // if (c=='+') | |
285 // { | |
286 // char* fl = (char*) "0x0010"; | |
287 // g_flag_on = strtol(fl, 0, 0); | |
288 // g_flag_off = 0; | |
289 // } | |
290 // else if (c=='-') | |
291 // { | |
292 // char* fl = (char*) "0x0010"; | |
293 // g_flag_off = strtol(fl, 0, 0); | |
294 // g_flag_on = 0; | |
295 // } | |
296 // return 0; | |
297 //} | |
298 |