0
|
1 /*
|
|
2 * This program is free software; you can redistribute it and/or modify
|
|
3 * it under the terms of the GNU General Public License as published by
|
|
4 * the Free Software Foundation; either version 3 of the License, or
|
|
5 * (at your option) any later version.
|
|
6 *
|
|
7 * Written (W) 2009-2011 Regina Bohnert
|
|
8 * Copyright (C) 2009-2011 Max Planck Society
|
|
9 */
|
|
10
|
|
11
|
|
12 #include <stdio.h>
|
|
13 #include <stdlib.h>
|
|
14 #include <signal.h>
|
|
15 #include <ctype.h>
|
|
16 #include <assert.h>
|
|
17
|
|
18 #include <vector>
|
|
19 using std::vector;
|
|
20 #include <string>
|
|
21 using std::string;
|
|
22 #include <algorithm>
|
|
23 using std::find;
|
|
24 using std::min;
|
|
25
|
|
26 #include <mex.h>
|
|
27
|
|
28
|
|
29 char *get_string(const mxArray *prhs);
|
|
30
|
|
31 typedef unsigned int uint32_t;
|
|
32 typedef unsigned char uint8_t;
|
|
33
|
|
34 /*
|
|
35 * [read_len num_reads] = get_bam_properties(fname, path_samtools, contig_name)
|
|
36 *
|
|
37 * -- input --
|
|
38 * prhs[0] file name of paired reads in BAM format (sorted by read id)
|
|
39 * prhs[1] path to samtools
|
|
40 * prhs[2] contig name
|
|
41 *
|
|
42 * -- output --
|
|
43 * plhs[0] length of read
|
|
44 * plhs[1] number of unique reads
|
|
45 */
|
|
46 void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
|
|
47 // checks for the right number of arguments
|
|
48 if (nrhs !=3 || nlhs > 2) {
|
|
49 mexErrMsgTxt("number of input and output args should be 3 and 2\nUSAGE:\n [read_len, num_reads] = get_bam_properties(fname, path_samtools, contig_name)\n");
|
|
50 return;
|
|
51 }
|
|
52
|
|
53 signal(SIGCHLD, SIG_IGN); // avoid zombies
|
|
54
|
|
55 // read input arguments
|
|
56 char *fname = get_string(prhs[0]);
|
|
57 char *path_samtools = get_string(prhs[1]);
|
|
58 char *contig_name = get_string(prhs[2]);
|
|
59 char command[10000];
|
|
60
|
|
61 sprintf(command, "%s./samtools view %s %s 2>/dev/null", path_samtools, fname, contig_name);
|
|
62 //printf("%s\n", command);
|
|
63
|
|
64 // get number of unique reads
|
|
65 int status;
|
|
66 uint32_t num_unique_reads = 0;
|
|
67 char command2[10000];
|
|
68 sprintf(command2, "%s | cut -f 1 | sort -u | wc -l", command);
|
|
69 FILE* fp = popen(command2, "r");
|
|
70 if (fp == NULL) {
|
|
71 mexErrMsgTxt("Error using popen\n");
|
|
72 }
|
|
73 int num_scans = 1;
|
|
74 num_scans = fscanf(fp, "%d", &num_unique_reads);
|
|
75 if (num_scans != 1) {
|
|
76 rewind(fp);
|
|
77 char ret[1000];
|
|
78 fgets(ret, 1000, fp);
|
|
79 fprintf(stdout, "%s", ret);
|
|
80 mexErrMsgTxt("Could not determine number of reads\n");
|
|
81 }
|
|
82 status = pclose(fp);
|
|
83 //printf("%i", num_unique_reads);
|
|
84
|
|
85 // select reads for given positions and strand
|
|
86 int num_rows_selected = min((int) num_unique_reads, 100);
|
|
87 sprintf(command, "%s | head -n %i | cut -f 1-11", command, num_rows_selected);
|
|
88 fp = popen(command, "r");
|
|
89 if (fp == NULL) {
|
|
90 mexErrMsgTxt("Error using popen\n");
|
|
91 }
|
|
92 /* SAM format
|
|
93 1: read id, 2: flag, 3: reference name, 4: start (1-based, incl.), 5: mapping quality,
|
|
94 6: CIGAR, 7: mate reference name, 8: mate start (1-based, incl.), 9: insert size, 10: read, 11: quality
|
|
95 12+: additional tags
|
|
96 */
|
|
97 uint32_t read_idx = 0, row_idx = 0, num_col = 0;
|
|
98 uint32_t flag = 0, start_pos = 0, map_score = 0, mate_end_pos = 0, num_matches = 0, num_del = 0, num_ins = 0, ins_size = 0;
|
|
99 char ri [1000], read_contig_name [1000], cg [1000], mate_read_id [1000], read [1000], read_qual [1000];
|
|
100 string last_read_id;
|
|
101 vector<uint32_t> block_lengths, block_starts;
|
|
102 vector<string> read_ids;
|
|
103 vector<string>::iterator it;
|
|
104
|
|
105 uint32_t read_len = 0;
|
|
106 bool empty_line = true;
|
|
107 int num_rows = 0;
|
|
108 while(empty_line && num_rows < num_rows_selected) {
|
|
109 num_col = fscanf(fp, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s", &ri, &flag, &read_contig_name, &start_pos, &map_score, &cg, &mate_read_id, &mate_end_pos, &ins_size, &read, &read_qual);
|
|
110 if (num_col != 11) {
|
|
111 mexErrMsgTxt("error reading SAM line\n");
|
|
112 }
|
|
113
|
|
114 string cigar = (string) cg;
|
|
115 // ignore lines with reads w/o mapping information
|
|
116 if (start_pos == 0 || cigar.compare("*")==0) {
|
|
117 continue;
|
|
118 }
|
|
119 // parse CIGAR
|
|
120 uint last_c = 0;
|
|
121 string last_str;
|
|
122 num_matches = 0;
|
|
123 char *end = NULL;
|
|
124 uint32_t tmp_nm = 0, tmp_nd = 0, tmp_ni = 0;
|
|
125 uint32_t last_block_start = 0, last_block_length = 0, last_intron_len = 0;
|
|
126 block_lengths.clear(); block_starts.clear();
|
|
127
|
|
128 for (uint c = 0; c < cigar.size(); c++) {
|
|
129 switch (cigar[c]) {
|
|
130 case 'M':
|
|
131 last_str = cigar.substr(last_c, c-last_c);
|
|
132 tmp_nm = strtoul(last_str.c_str(), &end, 10);
|
|
133 if (*end != '\0')
|
|
134 mexErrMsgTxt("error: number of mismatches\n");
|
|
135 end = NULL;
|
|
136 last_block_length += tmp_nm;
|
|
137 num_matches += tmp_nm;
|
|
138 last_c = c + 1;
|
|
139 break;
|
|
140 case 'I':
|
|
141 last_str = cigar.substr(last_c, c-last_c);
|
|
142 tmp_ni = strtoul(last_str.c_str(), &end, 10);
|
|
143 if (*end != '\0')
|
|
144 mexErrMsgTxt("error: number of insertions\n");
|
|
145 end = NULL;
|
|
146 num_ins += tmp_ni;
|
|
147 last_c = c + 1;
|
|
148 break;
|
|
149 case 'D':
|
|
150 last_str = cigar.substr(last_c, c-last_c);
|
|
151 tmp_nd = strtoul(last_str.c_str(), &end, 10);
|
|
152 if (*end != '\0')
|
|
153 mexErrMsgTxt("error: number of deletions\n");
|
|
154 end = NULL;
|
|
155 num_del += tmp_nd;
|
|
156 last_block_length += tmp_nd;
|
|
157 last_c = c + 1;
|
|
158 break;
|
|
159 case 'N':
|
|
160 last_str = cigar.substr(last_c, c-last_c);
|
|
161 last_intron_len = strtoul(last_str.c_str(), &end, 10);
|
|
162 end = NULL;
|
|
163 last_c = c + 1;
|
|
164 break;
|
|
165 case 'S':
|
|
166 break;
|
|
167 case 'H':
|
|
168 break;
|
|
169 case 'P':
|
|
170 break;
|
|
171 default:
|
|
172 break;
|
|
173 }
|
|
174 if (cigar[c] == 'N' || c==cigar.size()-1) {
|
|
175 block_starts.push_back(last_block_start);
|
|
176 last_block_start = last_block_start + last_block_length + last_intron_len;
|
|
177 last_intron_len = 0;
|
|
178 block_lengths.push_back(last_block_length);
|
|
179 last_block_length = 0;
|
|
180 }
|
|
181 }
|
|
182 read_len = 0;
|
|
183 for (uint n = 0; n < block_lengths.size(); n++) {
|
|
184 read_len += block_lengths[n];
|
|
185 }
|
|
186 empty_line = false;
|
|
187 } // end of stream parsing
|
|
188
|
|
189 status = pclose(fp);
|
|
190
|
|
191 if (empty_line)
|
|
192 mexErrMsgTxt("Could not determine read length\n");
|
|
193
|
|
194 plhs[0] = mxCreateDoubleScalar((double) read_len);
|
|
195 plhs[1] = mxCreateDoubleScalar((double) num_unique_reads);
|
|
196
|
|
197 return;
|
|
198 }
|
|
199
|
|
200
|
|
201 char *get_string(const mxArray *prhs) {
|
|
202 char *buf;
|
|
203 int buflen;
|
|
204 if (!prhs)
|
|
205 mexErrMsgTxt("get_string called with NULL pointer arg");
|
|
206 if (!mxIsChar(prhs))
|
|
207 mexErrMsgTxt("input is not a string");
|
|
208 if (mxGetM(prhs) != 1)
|
|
209 mexErrMsgTxt("input is not a row vector");
|
|
210 buflen = mxGetN(prhs) + 1;
|
|
211 buf = (char*) malloc(buflen);
|
|
212 /* copy the string from prhs into buf and add terminating NULL char */
|
|
213 if (mxGetString(prhs, buf, buflen))
|
|
214 mexErrMsgTxt("not enough space");
|
|
215 return buf;
|
|
216 }
|