| 
0
 | 
     1 /* written by Jonas Behr, Regina Bohnert and Gunnar Raetsch, FML Tuebingen, Germany, 2010 */
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 #include <stdio.h>
 | 
| 
 | 
     4 #include <assert.h>
 | 
| 
 | 
     5 #include "sam.h"
 | 
| 
 | 
     6 #include "get_reads_direct.h"
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 #include <vector>
 | 
| 
 | 
     9   using std::vector;
 | 
| 
 | 
    10 #include <string>
 | 
| 
 | 
    11   using std::string;
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 #ifdef __cplusplus
 | 
| 
 | 
    14 extern "C" {
 | 
| 
 | 
    15 #endif
 | 
| 
 | 
    16 
 | 
| 
 | 
    17 typedef struct {
 | 
| 
 | 
    18 	int beg, end;
 | 
| 
 | 
    19 	samfile_t *in;
 | 
| 
 | 
    20 } tmpstruct_t;
 | 
| 
 | 
    21 
 | 
| 
 | 
    22 typedef struct {
 | 
| 
 | 
    23     uint64_t u, v;
 | 
| 
 | 
    24 } pair64_t;
 | 
| 
 | 
    25 
 | 
| 
 | 
    26 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
 | 
| 
 | 
    27 {
 | 
| 
 | 
    28     uint32_t rbeg = b->core.pos;
 | 
| 
 | 
    29     uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
 | 
| 
 | 
    30     return (rend > beg && rbeg < end);
 | 
| 
 | 
    31 }
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off);
 | 
| 
 | 
    34 
 | 
| 
 | 
    35   int bam_fetch_reads(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_header_t* header, vector<CRead*>* reads, char strand);
 | 
| 
 | 
    36 
 | 
| 
 | 
    37 // callback for bam_plbuf_init()
 | 
| 
 | 
    38 static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
 | 
| 
 | 
    39 {
 | 
| 
 | 
    40 	//tmpstruct_t *tmp = (tmpstruct_t*)data;
 | 
| 
 | 
    41 	//if ((int)pos >= tmp->beg && (int)pos < tmp->end)
 | 
| 
 | 
    42 	//	printf("%s\t%d\t%d\n", tmp->in->header->target_name[tid], pos + 1, n);
 | 
| 
 | 
    43 	return 0;
 | 
| 
 | 
    44 }
 | 
| 
 | 
    45 #ifdef __cplusplus
 | 
| 
 | 
    46 }
 | 
| 
 | 
    47 #endif
 | 
| 
 | 
    48 int parse_sam_line(char* line, CRead* read);
 | 
| 
 | 
    49 int set_strand(char c);
 | 
| 
 | 
    50 //void parse_cigar(bam1_t* b, CRead* read);
 | 
| 
 | 
    51 void parse_cigar(bam1_t* b, CRead* read, bam_header_t* header);
 | 
| 
 | 
    52 
 | 
| 
 | 
    53 
 | 
| 
 | 
    54 int get_reads_from_bam(char* filename, char* region, vector<CRead*>* reads, char strand, int lsubsample)
 | 
| 
 | 
    55 {
 | 
| 
 | 
    56 	subsample = lsubsample;
 | 
| 
 | 
    57 	set_strand(strand);
 | 
| 
 | 
    58 
 | 
| 
 | 
    59 	srand (time(NULL));
 | 
| 
 | 
    60 	//srand (1234);
 | 
| 
 | 
    61 	tmpstruct_t tmp;
 | 
| 
 | 
    62 	tmp.in = samopen(filename, "rb", 0);
 | 
| 
 | 
    63 	if (tmp.in == 0) {
 | 
| 
 | 
    64 		fprintf(stderr, "Fail to open BAM file %s\n", filename);
 | 
| 
 | 
    65 		return 1;
 | 
| 
 | 
    66 	}
 | 
| 
 | 
    67 	int ref;
 | 
| 
 | 
    68 	bam_index_t *idx;
 | 
| 
 | 
    69 	bam_plbuf_t *buf;
 | 
| 
 | 
    70 	idx = bam_index_load(filename); // load BAM index
 | 
| 
 | 
    71 	if (idx == 0) {
 | 
| 
 | 
    72 		fprintf(stderr, "BAM indexing file is not available.\n");
 | 
| 
 | 
    73 		return 1;
 | 
| 
 | 
    74 	}
 | 
| 
 | 
    75 	bam_parse_region(tmp.in->header, region, &ref,
 | 
| 
 | 
    76 	                 &tmp.beg, &tmp.end); // parse the region
 | 
| 
 | 
    77 	if (ref < 0) {
 | 
| 
 | 
    78 		fprintf(stderr, "Invalid region %s\n", region);
 | 
| 
 | 
    79 		return 1;
 | 
| 
 | 
    80 	}
 | 
| 
 | 
    81 
 | 
| 
 | 
    82 	buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup
 | 
| 
 | 
    83 
 | 
| 
 | 
    84 	bam_fetch_reads(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, tmp.in->header, reads, strand);
 | 
| 
 | 
    85 	//fprintf(stdout, "intron_list: %d \n", intron_list->size());
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 	bam_plbuf_push(0, buf); // finalize pileup
 | 
| 
 | 
    88 	bam_index_destroy(idx);
 | 
| 
 | 
    89 	bam_plbuf_destroy(buf);
 | 
| 
 | 
    90 	samclose(tmp.in);
 | 
| 
 | 
    91 	return 0;
 | 
| 
 | 
    92 }
 | 
| 
 | 
    93 
 | 
| 
 | 
    94 
 | 
| 
 | 
    95 int bam_fetch_reads(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_header_t* header, vector<CRead*>* reads, char strand)
 | 
| 
 | 
    96 {
 | 
| 
 | 
    97 	int n_off;
 | 
| 
 | 
    98 	pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off);
 | 
| 
 | 
    99 	if (off == 0) return 0;
 | 
| 
 | 
   100 	{
 | 
| 
 | 
   101 		// retrive alignments
 | 
| 
 | 
   102 		uint64_t curr_off;
 | 
| 
 | 
   103 		int i, ret, n_seeks;
 | 
| 
 | 
   104 		n_seeks = 0; i = -1; curr_off = 0;
 | 
| 
 | 
   105 		bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
 | 
| 
 | 
   106 		for (;;) {
 | 
| 
 | 
   107 			if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
 | 
| 
 | 
   108 				if (i == n_off - 1) break; // no more chunks
 | 
| 
 | 
   109 				if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
 | 
| 
 | 
   110 				if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
 | 
| 
 | 
   111 					bam_seek(fp, off[i+1].u, SEEK_SET);
 | 
| 
 | 
   112 					curr_off = bam_tell(fp);
 | 
| 
 | 
   113 					++n_seeks;
 | 
| 
 | 
   114 				}
 | 
| 
 | 
   115 				++i;
 | 
| 
 | 
   116 			}
 | 
| 
 | 
   117 			if ((ret = bam_read1(fp, b)) > 0) {
 | 
| 
 | 
   118 				curr_off = bam_tell(fp);
 | 
| 
 | 
   119 				if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
 | 
| 
 | 
   120 				else if (is_overlap(beg, end, b)) 
 | 
| 
 | 
   121 				{
 | 
| 
 | 
   122 					int rr = rand();
 | 
| 
 | 
   123 					if ((rr%1000 < subsample))
 | 
| 
 | 
   124 					{
 | 
| 
 | 
   125 						CRead* read = new CRead();
 | 
| 
 | 
   126 						parse_cigar(b, read, header);
 | 
| 
 | 
   127 
 | 
| 
 | 
   128 						if (strand == '0' || strand==read->strand[0])
 | 
| 
 | 
   129 						{
 | 
| 
 | 
   130 							reads->push_back(read);
 | 
| 
 | 
   131 						}
 | 
| 
 | 
   132 						else if (read->strand[0]=='0')
 | 
| 
 | 
   133 						{
 | 
| 
 | 
   134 							reads->push_back(read);
 | 
| 
 | 
   135 						}
 | 
| 
 | 
   136 						//else if (read->strand[0]=='0'&&((b->core.flag & g_flag_off) >0))
 | 
| 
 | 
   137 						//{
 | 
| 
 | 
   138 						//	//fprintf(stdout, "(-)-strand; read->strand[0]==0, num_exons: %i \n", read->block_starts.size());
 | 
| 
 | 
   139 						//	// this flag means that the read has been reversed for alignment
 | 
| 
 | 
   140 						//	// flag bit set and (-)-strand requested
 | 
| 
 | 
   141 						//	reads->push_back(read);
 | 
| 
 | 
   142 						//}  
 | 
| 
 | 
   143 						//else if (read->strand[0]=='0'&&(g_flag_on>0&&(b->core.flag & g_flag_on)==0))
 | 
| 
 | 
   144 						//{
 | 
| 
 | 
   145 						//	//fprintf(stdout, "(+)-strand; read->strand[0]==0, num_exons: %i \n", read->block_starts.size());
 | 
| 
 | 
   146 						//	// (+)-strand requested and flag bit not set
 | 
| 
 | 
   147 						//	reads->push_back(read);
 | 
| 
 | 
   148 						//}  
 | 
| 
 | 
   149 					}
 | 
| 
 | 
   150 				}
 | 
| 
 | 
   151 			} else break; // end of file
 | 
| 
 | 
   152 		}
 | 
| 
 | 
   153 //		fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);
 | 
| 
 | 
   154 		bam_destroy1(b);
 | 
| 
 | 
   155 	}
 | 
| 
 | 
   156 	free(off);
 | 
| 
 | 
   157 	return 0;
 | 
| 
 | 
   158 }
 | 
| 
 | 
   159 
 | 
| 
 | 
   160 void parse_cigar(bam1_t* b, CRead* read, bam_header_t* header)
 | 
| 
 | 
   161 {
 | 
| 
 | 
   162 	read->start_pos = b->core.pos+1;
 | 
| 
 | 
   163 	read->set_strand('0');
 | 
| 
 | 
   164 
 | 
| 
 | 
   165 	for (int k = 0; k < b->core.n_cigar; ++k) 
 | 
| 
 | 
   166 	{
 | 
| 
 | 
   167 		int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
 | 
| 
 | 
   168 		int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
 | 
| 
 | 
   169 		//fprintf(stdout, "op:%d l:%d\n", op, l);
 | 
| 
 | 
   170 		if (op == BAM_CMATCH) 
 | 
| 
 | 
   171 		{
 | 
| 
 | 
   172 			if (k==0)
 | 
| 
 | 
   173 			{
 | 
| 
 | 
   174 				read->block_lengths.push_back(l);
 | 
| 
 | 
   175 				read->block_starts.push_back(0);
 | 
| 
 | 
   176 			}
 | 
| 
 | 
   177 			else
 | 
| 
 | 
   178 			{
 | 
| 
 | 
   179 				int op_prev = bam1_cigar(b)[k-1] & BAM_CIGAR_MASK; 
 | 
| 
 | 
   180 				int l_prev = bam1_cigar(b)[k-1] >> BAM_CIGAR_SHIFT;
 | 
| 
 | 
   181 				if (op_prev==BAM_CREF_SKIP)// intron before
 | 
| 
 | 
   182 				{
 | 
| 
 | 
   183 					if (read->block_lengths.size()>=1)
 | 
| 
 | 
   184 					{
 | 
| 
 | 
   185 						int last_block_start = (*(read->block_starts.end()-1));
 | 
| 
 | 
   186 						int intron_start = last_block_start+(*(read->block_lengths.end()-1));
 | 
| 
 | 
   187 						read->block_lengths.push_back(l);
 | 
| 
 | 
   188 						read->block_starts.push_back(intron_start+l_prev);
 | 
| 
 | 
   189 					}
 | 
| 
 | 
   190 					else
 | 
| 
 | 
   191 					{
 | 
| 
 | 
   192 						// start of first block was not a match
 | 
| 
 | 
   193 						read->block_lengths.push_back(l);
 | 
| 
 | 
   194 						read->block_starts.push_back(0);
 | 
| 
 | 
   195 					}
 | 
| 
 | 
   196 				}
 | 
| 
 | 
   197 				else
 | 
| 
 | 
   198 				{
 | 
| 
 | 
   199 					if (read->block_lengths.size()>=1 && op == BAM_CDEL)// if it is an insertion then the matching block is not inreased
 | 
| 
 | 
   200 						(*(read->block_lengths.end()-1))+=l;
 | 
| 
 | 
   201 					else
 | 
| 
 | 
   202 					{
 | 
| 
 | 
   203 						//char *samline = bam_format1(header, b);
 | 
| 
 | 
   204 						//printf("header: %s \n", samline);
 | 
| 
 | 
   205 					}
 | 
| 
 | 
   206 				}
 | 
| 
 | 
   207 			}
 | 
| 
 | 
   208 		}
 | 
| 
 | 
   209 		else if (op == BAM_CDEL) 
 | 
| 
 | 
   210 		{
 | 
| 
 | 
   211 			if (k>0 && read->block_lengths.size()>=1)
 | 
| 
 | 
   212 				(*(read->block_lengths.end()-1))+=l;
 | 
| 
 | 
   213 		} 
 | 
| 
 | 
   214 		else if (op == BAM_CREF_SKIP)//intron
 | 
| 
 | 
   215 		{}
 | 
| 
 | 
   216 		else if (op == BAM_CINS || op == BAM_CSOFT_CLIP)
 | 
| 
 | 
   217 		{}
 | 
| 
 | 
   218 	}
 | 
| 
 | 
   219 	// parse auxiliary data
 | 
| 
 | 
   220     uint8_t* s = bam1_aux(b);
 | 
| 
 | 
   221 	uint8_t* end = b->data + b->data_len; 
 | 
| 
 | 
   222 	while (s < end) 
 | 
| 
 | 
   223 	{
 | 
| 
 | 
   224 		 uint8_t type, key[2];
 | 
| 
 | 
   225 		 key[0] = s[0]; key[1] = s[1];
 | 
| 
 | 
   226 		 s += 2; type = *s; ++s;
 | 
| 
 | 
   227 		 //fprintf(stdout, "\n%c%c:%c\n", key[0], key[1], type);
 | 
| 
 | 
   228 		 if (type == 'A')
 | 
| 
 | 
   229 		 {
 | 
| 
 | 
   230 			if ( key[0] =='X' && key[1] == 'S')
 | 
| 
 | 
   231 			{
 | 
| 
 | 
   232 				read->set_strand((char) *s);
 | 
| 
 | 
   233 			}
 | 
| 
 | 
   234 		 	++s;
 | 
| 
 | 
   235 		 }
 | 
| 
 | 
   236 		else if (type == 'C')
 | 
| 
 | 
   237 		{ 
 | 
| 
 | 
   238 			if ( key[0] =='H' && key[1] == '0')
 | 
| 
 | 
   239 			{
 | 
| 
 | 
   240 				uint8_t matches = *s;
 | 
| 
 | 
   241 				read->matches = (int) matches;
 | 
| 
 | 
   242 			}
 | 
| 
 | 
   243 			if ( key[0] =='N' && key[1] == 'M')
 | 
| 
 | 
   244 			{
 | 
| 
 | 
   245 				uint8_t mismatches = *s;
 | 
| 
 | 
   246 				read->mismatches = (int) mismatches;
 | 
| 
 | 
   247 			}
 | 
| 
 | 
   248 			if ( key[0] =='H' && key[1] == 'I')
 | 
| 
 | 
   249 			{
 | 
| 
 | 
   250 				uint8_t mai = *s;
 | 
| 
 | 
   251 				read->multiple_alignment_index = (int) mai;
 | 
| 
 | 
   252 			}
 | 
| 
 | 
   253 
 | 
| 
 | 
   254 			++s;
 | 
| 
 | 
   255 		}
 | 
| 
 | 
   256 		else if (type == 'c') { ++s; }
 | 
| 
 | 
   257 		else if (type == 'S') { s += 2;	}
 | 
| 
 | 
   258 		else if (type == 's') { s += 2;	}
 | 
| 
 | 
   259 		else if (type == 'I') { s += 4; }
 | 
| 
 | 
   260 		else if (type == 'i') { s += 4; }
 | 
| 
 | 
   261 		else if (type == 'f') { s += 4;	}
 | 
| 
 | 
   262 		else if (type == 'd') { s += 8;	}
 | 
| 
 | 
   263 		else if (type == 'Z') { ++s; }
 | 
| 
 | 
   264 		else if (type == 'H') { ++s; }
 | 
| 
 | 
   265 	}
 | 
| 
 | 
   266 }
 | 
| 
 | 
   267 
 | 
| 
 | 
   268 int set_strand(char c)
 | 
| 
 | 
   269 {
 | 
| 
 | 
   270 	if (c=='+')
 | 
| 
 | 
   271 	{
 | 
| 
 | 
   272 		char* fl = (char*) "0x0010";
 | 
| 
 | 
   273 		g_flag_on = strtol(fl, 0, 0);
 | 
| 
 | 
   274 		g_flag_off = 0;
 | 
| 
 | 
   275 	}
 | 
| 
 | 
   276 	else if (c=='-')
 | 
| 
 | 
   277 	{
 | 
| 
 | 
   278 		char* fl = (char*) "0x0010";
 | 
| 
 | 
   279 		g_flag_off = strtol(fl, 0, 0);
 | 
| 
 | 
   280 		g_flag_on = 0;
 | 
| 
 | 
   281 	}
 | 
| 
 | 
   282 }
 | 
| 
 | 
   283 
 |