| 0 | 1 /* written by Jonas Behr, Regina Bohnert and Gunnar Raetsch, FML Tuebingen, Germany, 2010 */ | 
|  | 2 | 
|  | 3 #include <stdio.h> | 
|  | 4 #include <string.h> | 
|  | 5 #include <signal.h> | 
|  | 6 #include <mex.h> | 
|  | 7 #include <algorithm> | 
|  | 8 #include <vector> | 
|  | 9 	using std::vector; | 
|  | 10 #include "get_reads_direct.h" | 
|  | 11 #include "mex_input.h" | 
|  | 12 #include "read.h" | 
|  | 13 | 
|  | 14 #define MAXLINE 10000 | 
|  | 15 | 
|  | 16 /* | 
|  | 17  * input: | 
|  | 18  * 1 bam file | 
|  | 19  * 2 chromosome | 
|  | 20  * 3 region start (1-based index) | 
|  | 21  * 4 region end (1-based index) | 
|  | 22  * 5 strand (either '+' or '-' or '0') | 
|  | 23  * [6] collapse flag: if true the reads are collapsed to a coverage track | 
|  | 24  * [7] subsample percentage: percentage of reads to be subsampled (in per mill) | 
|  | 25  * [8] intron length filter | 
|  | 26  * [9] exon length filter | 
|  | 27  * [10] mismatch filter | 
|  | 28  * [11] bool: use mapped reads for coverage | 
|  | 29  * [12] bool: use spliced reads for coverage | 
|  | 30  * [13] return maxminlen | 
|  | 31  * [14] return pair coverage and pair index list | 
|  | 32  * [15] only_clipped | 
|  | 33  * [15] switch of pair filter | 
|  | 34  * | 
|  | 35  * output: | 
|  | 36  * 1 coverage | 
|  | 37  * [2] intron cell array | 
|  | 38  * [3] pair coverage | 
|  | 39  * [4] pair list | 
|  | 40  * | 
|  | 41  * example call: | 
|  | 42  * [cov introns] = get_reads('polyA_left_I+_el15_mm1_spliced.bam', 'I', 10000, 12000, '-', 1, 30); | 
|  | 43  */ | 
|  | 44 void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { | 
|  | 45 | 
|  | 46 	if (nrhs<5 || nrhs>16 || (nlhs<1 || nlhs>4)) { | 
|  | 47 		fprintf(stderr, "usage: [x [introns] [pair]] = get_reads(fname, chr, start, end, strand, [collapse], [subsample], [max intron length], [min exonlength], [max mismatches], [mapped], [spliced], [maxminlen], [pair], [only clipped], [all pairs]);\n"); | 
|  | 48 		return; | 
|  | 49 	} | 
|  | 50 | 
|  | 51 	/* obligatory arguments | 
|  | 52 	 * **********************/ | 
|  | 53 	char *fname = get_string(prhs[0]); | 
|  | 54 	//fprintf(stdout, "arg1: %s\n", fname); | 
|  | 55 	char *chr = get_string(prhs[1]); | 
|  | 56 	//fprintf(stdout, "arg2: %s\n", chr); | 
|  | 57 	int from_pos = get_int(prhs[2]); | 
|  | 58 	//fprintf(stdout, "arg3: %d\n", from_pos); | 
|  | 59 	int to_pos = get_int(prhs[3]); | 
|  | 60 	//fprintf(stdout, "arg4: %d\n", to_pos); | 
|  | 61 	char *strand = get_string(prhs[4]); | 
|  | 62 	//fprintf(stdout, "arg5: %s\n", strand); | 
|  | 63 | 
|  | 64 	if (from_pos>to_pos) | 
|  | 65 		 mexErrMsgTxt("Start (arg 3) must be <= end (arg 4)\n"); | 
|  | 66 | 
|  | 67 	if (strand[0]!='+' && strand[0]!='-' && strand[0]!='0') | 
|  | 68 		mexErrMsgTxt("Unknown strand (arg 5): either + or - or 0"); | 
|  | 69 | 
|  | 70 	/* optional arguments | 
|  | 71 	 * ******************/ | 
|  | 72 	int collapse = 0; | 
|  | 73 	if (nrhs>=6) | 
|  | 74 		collapse = get_int(prhs[5]); | 
|  | 75 | 
|  | 76 	int subsample = 1000; | 
|  | 77 	if (nrhs>=7) | 
|  | 78 		subsample = get_int(prhs[6]); | 
|  | 79 | 
|  | 80 	int intron_len_filter = 1e9; | 
|  | 81 	if (nrhs>=8) | 
|  | 82 		intron_len_filter = get_int(prhs[7]); | 
|  | 83 | 
|  | 84 	int exon_len_filter = -1; | 
|  | 85 	if (nrhs>=9) | 
|  | 86 		exon_len_filter = get_int(prhs[8]); | 
|  | 87 | 
|  | 88 	int filter_mismatch = 1e9; | 
|  | 89 	if (nrhs>=10) | 
|  | 90 		filter_mismatch = get_int(prhs[9]); | 
|  | 91 | 
|  | 92 	int mapped = 1; | 
|  | 93 	if (nrhs>=11) | 
|  | 94 		mapped = get_int(prhs[10]); | 
|  | 95 | 
|  | 96 	int spliced = 1; | 
|  | 97 	if (nrhs>=12) | 
|  | 98 		spliced = get_int(prhs[11]); | 
|  | 99 | 
|  | 100 	int maxminlen = 0; | 
|  | 101 	if (nrhs>=13) | 
|  | 102 		maxminlen = get_int(prhs[12]); | 
|  | 103 | 
|  | 104 	int pair_cov = 0; | 
|  | 105 	if (nrhs>=14) | 
|  | 106 		pair_cov = get_int(prhs[13]); | 
|  | 107 | 
|  | 108 	int only_clipped = 0; | 
|  | 109 	if (nrhs>=15) | 
|  | 110 		only_clipped = get_int(prhs[14]); | 
|  | 111 | 
|  | 112 	int no_pair_filter = 0; | 
|  | 113 	if (nrhs>=16) | 
|  | 114 		no_pair_filter = get_int(prhs[15]); | 
|  | 115 | 
|  | 116 | 
|  | 117 	/* call function to get reads | 
|  | 118 	 * **************************/ | 
|  | 119 	char region[MAXLINE]; | 
|  | 120 	sprintf(region, "%s:%i-%i", chr, from_pos, to_pos); | 
|  | 121 | 
|  | 122 	vector<CRead*> all_reads; | 
|  | 123 | 
|  | 124 	get_reads_from_bam(fname, region, &all_reads, strand[0], subsample); | 
|  | 125 | 
|  | 126 	for (int i=0; i<all_reads.size(); i++) { | 
|  | 127 		all_reads[i]->strip_leftright_tag() ; | 
|  | 128 	} | 
|  | 129 | 
|  | 130 	/* filter reads | 
|  | 131 	 * **************/ | 
|  | 132 	int left = 0; | 
|  | 133 	int right = 0; | 
|  | 134 | 
|  | 135 	vector<CRead*> reads; | 
|  | 136 	for (int i=0; i<all_reads.size(); i++) { | 
|  | 137 		if (all_reads[i]->left) | 
|  | 138 			left++; | 
|  | 139 		if (all_reads[i]->right) | 
|  | 140 			right++; | 
|  | 141 		//if (all_reads[i]->max_intron_len()<intron_len_filter && all_reads[i]->min_exon_len()>exon_len_filter && all_reads[i]->get_mismatches()<=filter_mismatch && all_reads[i]->multiple_alignment_index==0) | 
|  | 142 		if (all_reads[i]->max_intron_len()<intron_len_filter && all_reads[i]->min_exon_len()>exon_len_filter && all_reads[i]->get_mismatches()<=filter_mismatch && (only_clipped==0 || all_reads[i]->is_clipped)) | 
|  | 143 			reads.push_back(all_reads[i]); | 
|  | 144 	} | 
|  | 145 | 
|  | 146 | 
|  | 147 	/* prepare output | 
|  | 148 	 * **************/ | 
|  | 149 	int num_rows = reads.size(); | 
|  | 150 	int num_pos = to_pos-from_pos+1; | 
|  | 151 | 
|  | 152 	if (pair_cov==1 && nlhs>=3) { | 
|  | 153 		// sort reads by read_id | 
|  | 154 		//printf("\n\nleft:%i right:%i \n\n", left, right); | 
|  | 155 		//printf("\nreads[0]->read_id: %s\n", reads[0]->read_id); | 
|  | 156 		sort(reads.begin(), reads.end(), CRead::compare_by_read_id); | 
|  | 157 	} | 
|  | 158 | 
|  | 159 	// read coverages collapsed | 
|  | 160 	if (collapse) { | 
|  | 161 		plhs[0] = mxCreateNumericMatrix(1, num_pos, mxUINT32_CLASS, mxREAL); | 
|  | 162 		uint32_t *mask_ret = (uint32_t*) mxGetData(plhs[0]); | 
|  | 163 		if (num_pos>0 && mask_ret==NULL) | 
|  | 164 			mexErrMsgTxt("Error allocating memory\n"); | 
|  | 165 		if (mapped && spliced) { | 
|  | 166 			for (int i=0; i<reads.size(); i++) { | 
|  | 167 				reads[i]->get_coverage(from_pos, to_pos, mask_ret); | 
|  | 168 			} | 
|  | 169 		} else { | 
|  | 170 			for (int i=0; i<reads.size(); i++) { | 
|  | 171 				ssize_t num_exons = reads[i]->block_starts.size(); | 
|  | 172 				if ((num_exons==1 && mapped) || (num_exons>1 && spliced)) | 
|  | 173 					reads[i]->get_coverage(from_pos, to_pos, mask_ret); | 
|  | 174 			} | 
|  | 175 		} | 
|  | 176 	} | 
|  | 177 	// reads not collapsed | 
|  | 178 	else { | 
|  | 179 		uint32_t nzmax = 0; // maximal number of nonzero elements | 
|  | 180 		int len = to_pos-from_pos+1; | 
|  | 181 		for (uint i=0; i<reads.size(); i++) | 
|  | 182 		{ | 
|  | 183 			ssize_t num_exons = reads[i]->block_starts.size(); | 
|  | 184 			if (!((mapped && spliced) || (num_exons==1 && mapped) || (num_exons>1 && spliced))) | 
|  | 185 			{ | 
|  | 186 				continue; | 
|  | 187 			} | 
|  | 188 			for (uint n = 0; n < reads[i]->block_starts.size(); n++) | 
|  | 189 			{ | 
|  | 190 				uint32_t from, to; | 
|  | 191 				if (reads[i]->block_starts[n]+reads[i]->start_pos-from_pos >= 0) | 
|  | 192 					from = reads[i]->block_starts[n]+reads[i]->start_pos-from_pos; | 
|  | 193 				else | 
|  | 194 					from = 0; | 
|  | 195 				if (reads[i]->block_starts[n]+reads[i]->start_pos-from_pos+reads[i]->block_lengths[n] >= 0) | 
|  | 196 					to = reads[i]->block_starts[n]+reads[i]->start_pos-from_pos+reads[i]->block_lengths[n]; | 
|  | 197 				else | 
|  | 198 					to = 0; | 
|  | 199 				for (int bp=from; bp<to&bp<len; bp++) | 
|  | 200 				{ | 
|  | 201 					nzmax++; | 
|  | 202 				} | 
|  | 203 			} | 
|  | 204 		} | 
|  | 205 		// 1st row: row indices | 
|  | 206 		// 2nd row: column indices | 
|  | 207 		plhs[0] = mxCreateDoubleMatrix(2, nzmax, mxREAL); | 
|  | 208 		double *mask_ret = (double*) mxGetData(plhs[0]); | 
|  | 209 		if (nzmax>0 && mask_ret==NULL) | 
|  | 210 			mexErrMsgTxt("Error allocating memory\n"); | 
|  | 211 		uint32_t mask_ret_c = 0; // counter | 
|  | 212 		for (uint i=0; i<reads.size(); i++) | 
|  | 213 		{ | 
|  | 214 			ssize_t num_exons = reads[i]->block_starts.size(); | 
|  | 215 			if (!((mapped && spliced) || (num_exons==1 && mapped) || (num_exons>1 && spliced))) | 
|  | 216 			{ | 
|  | 217 				continue; | 
|  | 218 			} | 
|  | 219 			reads[i]->get_reads_sparse(from_pos, to_pos, mask_ret, mask_ret_c, i); | 
|  | 220 		} | 
|  | 221 		if (mask_ret_c!=2*nzmax) | 
|  | 222 			mexErrMsgTxt("Error filling index arrays for sparse matrix\n"); | 
|  | 223 	} | 
|  | 224 	// introns | 
|  | 225 	if (maxminlen==0 && nlhs>=2) { | 
|  | 226 			vector<int> intron_list; | 
|  | 227 			for (int i=0; i<reads.size(); i++) { | 
|  | 228 				reads[i]->get_introns(&intron_list); | 
|  | 229 			} | 
|  | 230 | 
|  | 231 			plhs[1] = mxCreateNumericMatrix(2, intron_list.size()/2, mxUINT32_CLASS, mxREAL); | 
|  | 232 			uint32_t *p_intron_list = (uint32_t*) mxGetData(plhs[1]); | 
|  | 233 			for (int p = 0; p<intron_list.size(); p++) { | 
|  | 234 				p_intron_list[p] = intron_list[p]; | 
|  | 235 			} | 
|  | 236 			intron_list.clear(); | 
|  | 237 		} else if (nlhs>=2) { | 
|  | 238 			vector<uint32_t> intron_starts; | 
|  | 239 			vector<uint32_t> intron_ends; | 
|  | 240 			vector<uint32_t> block_len1; | 
|  | 241 			vector<uint32_t> block_len2; | 
|  | 242 			for (int i=0; i<reads.size(); i++) { | 
|  | 243 				reads[i]->get_introns(&intron_starts, &intron_ends, &block_len1, &block_len2); | 
|  | 244 			} | 
|  | 245 | 
|  | 246 			plhs[1] = mxCreateNumericMatrix(4, intron_starts.size(), mxINT32_CLASS, mxREAL); | 
|  | 247 			uint32_t *p_intron_list = (uint32_t*) mxGetData(plhs[1]); | 
|  | 248 			for (int p = 0; p<intron_starts.size(); p++) { | 
|  | 249 				p_intron_list[4*p] = intron_starts[p]; | 
|  | 250 				p_intron_list[(4*p)+1] = intron_ends[p]; | 
|  | 251 				p_intron_list[(4*p)+2] = block_len1[p]; | 
|  | 252 				p_intron_list[(4*p)+3] = block_len2[p]; | 
|  | 253 			} | 
|  | 254 			intron_starts.clear() ; | 
|  | 255 			intron_ends.clear() ; | 
|  | 256 			block_len1.clear() ; | 
|  | 257 			block_len2.clear() ; | 
|  | 258 	} | 
|  | 259 	if (pair_cov==1 && nlhs>=3) { | 
|  | 260 		plhs[2] = mxCreateNumericMatrix(1, num_pos, mxUINT32_CLASS, mxREAL); | 
|  | 261 		uint32_t *p_pair_map = (uint32_t*) mxGetData(plhs[2]); | 
|  | 262 		if (num_pos>0 && p_pair_map==NULL) | 
|  | 263 			mexErrMsgTxt("Error allocating memory\n"); | 
|  | 264 | 
|  | 265 		vector<int> pair_ids; | 
|  | 266 | 
|  | 267 		int take_cnt = 0; | 
|  | 268 		int discard_cnt = 0; | 
|  | 269 		int discard_strand_cnt = 0; | 
|  | 270 		//printf("reads.size(): %i\n", reads.size()); | 
|  | 271 		// find consecutive reads with the same id | 
|  | 272 		for (int i=0; i<((int) reads.size())-1; i++) | 
|  | 273 		{ | 
|  | 274 			//printf("reads[%i]->read_id: %s\n", i, reads[i]->read_id); | 
|  | 275 			int j = i+1; | 
|  | 276 			while(j<reads.size() && strcmp(reads[i]->read_id, reads[j]->read_id) == 0) | 
|  | 277 			{ | 
|  | 278 				if ((reads[i]->left && reads[j]->right) || (reads[j]->left && reads[i]->right) && (reads[i]->reverse != reads[j]->reverse)) | 
|  | 279 				{ | 
|  | 280 					if (reads[i]->get_last_position()==-1 || reads[j]->get_last_position()==-1) | 
|  | 281 						break; | 
|  | 282 					if (false)//(reads[i]->strand[0]=='0' && reads[j]->strand[0]=='0' ) | 
|  | 283 					{ | 
|  | 284 						// discard pairs without strand information | 
|  | 285 						discard_strand_cnt++; | 
|  | 286 					} | 
|  | 287 					else if (reads[i]->get_last_position()<reads[j]->start_pos && reads[j]->start_pos-reads[i]->get_last_position()<60000) | 
|  | 288 					{ | 
|  | 289 						int from = std::max(0, reads[i]->get_last_position()-from_pos); | 
|  | 290 						int to = std::min(num_pos-1, reads[j]->start_pos-from_pos); | 
|  | 291 						pair_ids.push_back(i); | 
|  | 292 						pair_ids.push_back(j); | 
|  | 293 						for (int k=from; k<to; k++) | 
|  | 294 							p_pair_map[k]++; | 
|  | 295 						take_cnt++; | 
|  | 296 					} | 
|  | 297 					else if (reads[i]->start_pos>reads[j]->get_last_position() && reads[j]->get_last_position()-reads[i]->start_pos<60000) | 
|  | 298 					{ | 
|  | 299 						int from = std::max(0, reads[j]->get_last_position()-from_pos); | 
|  | 300 						int to = std::min(num_pos-1, reads[i]->start_pos-from_pos); | 
|  | 301 						pair_ids.push_back(i); | 
|  | 302 						pair_ids.push_back(j); | 
|  | 303 						for (int k=from; k<to; k++) | 
|  | 304 							p_pair_map[k]++; | 
|  | 305 						take_cnt++; | 
|  | 306 					} | 
|  | 307 					else | 
|  | 308 					{ | 
|  | 309 						if (no_pair_filter>0 && reads[i]->start_pos<reads[j]->start_pos) | 
|  | 310 						{ | 
|  | 311 							pair_ids.push_back(i); | 
|  | 312 							pair_ids.push_back(j); | 
|  | 313 							take_cnt++; | 
|  | 314 						} | 
|  | 315 						else if (no_pair_filter>0) | 
|  | 316 						{ | 
|  | 317 							pair_ids.push_back(j); | 
|  | 318 							pair_ids.push_back(i); | 
|  | 319 							take_cnt++; | 
|  | 320 						} | 
|  | 321 						else | 
|  | 322 							discard_cnt++; | 
|  | 323 						//printf("istart:%i, ilast:%i  jstart:%i, jlast: %i\n", reads[i]->start_pos, reads[i]->get_last_position(), reads[j]->start_pos, reads[j]->get_last_position()); | 
|  | 324 					} | 
|  | 325 				} | 
|  | 326 				else | 
|  | 327 					discard_cnt++; | 
|  | 328 				j++; | 
|  | 329 			} | 
|  | 330 		} | 
|  | 331 		//printf("take:%i, discard:%i discard_strand_cnt:%i\n", take_cnt, discard_cnt+discard_strand_cnt, discard_strand_cnt); | 
|  | 332 | 
|  | 333 		if (nlhs>=4) { | 
|  | 334 			plhs[3] = mxCreateNumericMatrix(2, pair_ids.size()/2, mxUINT32_CLASS, mxREAL); | 
|  | 335 			uint32_t *pair_ids_ret = (uint32_t*) mxGetData(plhs[3]); | 
|  | 336 			if (pair_ids.size()>0 && pair_ids_ret==NULL) | 
|  | 337 				mexErrMsgTxt("Error allocating memory\n"); | 
|  | 338 			for (int i=0; i<pair_ids.size(); i++) { | 
|  | 339 				pair_ids_ret[i] = pair_ids[i]; | 
|  | 340 			} | 
|  | 341 		} | 
|  | 342 	} | 
|  | 343 	for (int i=0; i<all_reads.size(); i++) | 
|  | 344 		delete all_reads[i]; | 
|  | 345 } | 
|  | 346 |