1
|
1 //Author: Jie Wu@CSHL
|
|
2 //TO replace the original Perl code
|
|
3
|
|
4 #include <stdio.h>
|
|
5 #include <stdlib.h>
|
|
6 #include <iostream>
|
|
7 #include <fstream>
|
|
8 #include <string.h>
|
|
9 #include <unistd.h>
|
|
10 #include <sstream>
|
|
11 #include <map>
|
|
12 #include <vector>
|
|
13 #include <math.h>
|
|
14 #include <time.h>
|
|
15 #include <limits>
|
|
16 #include <iomanip>
|
|
17
|
|
18 using namespace std;
|
|
19
|
|
20 int MAX_LINE_LEN = 1024*1024;
|
|
21
|
|
22 void printusage()
|
|
23 {
|
|
24 cout<< "\tUsage:"<<endl;
|
|
25 cout<<"\t-s\tread_size"<<endl;
|
|
26 cout<<"\t-b\tIRM file"<<endl;
|
|
27 cout<<"\t-f\tFZM file"<<endl;
|
|
28 cout<<"\t-1\tmapping result 1"<<endl;
|
|
29 cout<<"\t-2\tmapping result 2"<<endl;
|
|
30 cout<<"\t-d\tTXdb database file"<<endl;
|
|
31 cout<<"\t-o\toutput prefix"<<endl;
|
|
32
|
|
33 }
|
|
34
|
|
35 struct ReadPairInfo
|
|
36 {
|
|
37 bool isPaired;
|
|
38 vector<int> IsoFlag; //flag as 1 if mapped to the isoform
|
|
39 vector<int> Size; //fragment size in the isoform
|
|
40 vector<int> Pos;
|
|
41 };
|
|
42
|
|
43 class TXdb_entry{
|
|
44 public:
|
|
45 string chrid;
|
|
46 //int start;
|
|
47 //int end;
|
|
48 string id;
|
|
49 bool iscomplete;
|
|
50 int start;
|
|
51 int end;
|
|
52 int exonstarts[3];
|
|
53 char strand;
|
|
54 int exonsize[3];
|
|
55 int len1;
|
|
56 int len2;
|
|
57 vector< ReadPairInfo > pairs;
|
|
58 int snum1; //single-end reads
|
|
59 int snum2;
|
|
60 int snum12;
|
|
61 float ir, bir;
|
|
62 int enum1; //exon body reads
|
|
63 int enum2;
|
|
64 int enum3;
|
|
65
|
|
66 int jnum12;
|
|
67 int jnum23;
|
|
68 int jnum13;
|
|
69
|
|
70 int totalreadnum;
|
|
71 void estimate(vector<double> &FZM, map<string, vector<double> > &IRM, int &read_size)
|
|
72 {
|
|
73 float Maxe=0;
|
|
74 float Max = -numeric_limits<float>::max();
|
|
75 float BMaxe=0;
|
|
76 float BMax=-numeric_limits<float>::max();
|
|
77 for(float e1=0.001; e1<1;e1=e1+0.001)
|
|
78 {
|
|
79 float efix1 = e1*len1/
|
|
80 (e1*exonsize[1]+len2);
|
|
81 float efix2 = 1-efix1;
|
|
82 float LL = snum12*log( efix1/(len1-read_size+1)
|
|
83 + efix2/(len2-read_size+1) )
|
|
84 + snum1*log(efix1/(len1-read_size+1))
|
|
85 + snum2*log(efix2/(len2-read_size+1));
|
|
86 //cout<<LL<<endl;
|
|
87 //cout<<efix1<<"\t" <<pairs.size()<<"\t" <<len2<<"\t" <<endl;
|
|
88 //int num1=0, num2=0,num12=0;
|
|
89 for(int i=0;i<pairs.size();i++)
|
|
90 {
|
|
91 if(pairs[i].IsoFlag[0] == 1)
|
|
92 {
|
|
93 if(pairs[i].IsoFlag[1] == 0)
|
|
94 {
|
|
95 // num1++;
|
|
96 float tmp=FZM[pairs[i].Size[0]]*efix1/(len1-pairs[i].Size[0]+1);
|
|
97 if (tmp == 0)
|
|
98 {
|
|
99 LL = LL - 308;
|
|
100 }
|
|
101 else
|
|
102 {
|
|
103 LL = LL + log(tmp);
|
|
104 }
|
|
105 }
|
|
106 else
|
|
107 {
|
|
108 // num12++;
|
|
109 float tmp=(FZM[pairs[i].Size[0]]*efix1/(len1-pairs[i].Size[0]+1))
|
|
110 + (FZM[pairs[i].Size[1]]*efix2/(len2-pairs[i].Size[1]+1));
|
|
111 if(tmp ==0)
|
|
112 {
|
|
113 LL = LL -308;
|
|
114 }
|
|
115 else
|
|
116 {
|
|
117 LL = LL + log(tmp);
|
|
118 }
|
|
119 }
|
|
120 }
|
|
121 else
|
|
122 {
|
|
123 //num2++;
|
|
124 float tmp = FZM[pairs[i].Size[1]]*efix2/(len2-pairs[i].Size[1]+1);
|
|
125 if(tmp ==0)
|
|
126 {
|
|
127 LL= LL - 308;
|
|
128 }
|
|
129 else
|
|
130 {
|
|
131 LL = LL +log(tmp);
|
|
132 }
|
|
133 }
|
|
134 }
|
|
135 //cout<<num1<<"\t"<<num2<<"\t"<<num12<<"\n";
|
|
136 //cout<<LL<<endl;
|
|
137
|
|
138 if(IRM["CA"].size() > 0)
|
|
139 {
|
|
140 float BLL;
|
|
141 string eventtype =id.substr(0,2);
|
|
142 if(!eventtype.compare("ME"))
|
|
143 {
|
|
144 eventtype = "CA";
|
|
145 }
|
|
146 BLL = LL + log(IRM[eventtype][int(e1/0.001)]);
|
|
147 if(BLL > BMax)
|
|
148 {
|
|
149 BMax = BLL;
|
|
150 BMaxe = e1;
|
|
151 }
|
|
152 }
|
|
153 //cout<<LL<<endl;
|
|
154 if(LL > Max)
|
|
155 {
|
|
156 Max= LL;
|
|
157 Maxe = e1;
|
|
158
|
|
159 }
|
|
160
|
|
161 }
|
|
162 ir= Maxe;
|
|
163 bir= BMaxe;
|
|
164 }
|
|
165
|
|
166 TXdb_entry(char *line)
|
|
167 {
|
|
168 size_t found;
|
|
169 string linestr(line);
|
|
170 found = linestr.find("[L]");
|
|
171 if(found != string::npos)
|
|
172 {
|
|
173 iscomplete=1;
|
|
174
|
|
175 char *token;
|
|
176
|
|
177 token = strtok(line,"\t");
|
|
178 if(token==NULL)
|
|
179 {
|
|
180 cout<<"error1"<<endl;
|
|
181 exit(0);
|
|
182 }
|
|
183 else
|
|
184 {
|
|
185 chrid = token;
|
|
186 }
|
|
187 token = strtok(NULL,"\t");
|
|
188 if(token==NULL)
|
|
189 {
|
|
190 cout<<"error2"<<endl;
|
|
191 exit(0);
|
|
192 }
|
|
193 else
|
|
194 {
|
|
195 sscanf(token,"%d",&start);
|
|
196 }
|
|
197 token = strtok(NULL,"\t");
|
|
198 if(token==NULL)
|
|
199 {
|
|
200 cout<<"error3"<<endl;
|
|
201 exit(0);
|
|
202 }
|
|
203 else
|
|
204 {
|
|
205 sscanf(token,"%d",&end);
|
|
206 }
|
|
207 token = strtok(NULL,"\t");
|
|
208 if(token==NULL)
|
|
209 {
|
|
210 cout<<"error4"<<endl;exit(0);
|
|
211 }
|
|
212 else
|
|
213 {
|
|
214 id = token;
|
|
215 id = id.substr(0,id.size()-3);
|
|
216 }
|
|
217 token = strtok(NULL,"\t");
|
|
218 token = strtok(NULL,"\t");
|
|
219 if(token==NULL)
|
|
220 {
|
|
221 cout<<"error5"<<endl;exit(0);
|
|
222 }
|
|
223 else
|
|
224 {
|
|
225 sscanf(token,"%c",&strand);
|
|
226 }
|
|
227 for(int i=0;i<5;i++)
|
|
228 {
|
|
229 token=strtok(NULL,"\t");
|
|
230 }
|
|
231 if(token==NULL){
|
|
232 cout<<"error6"<<endl;exit(0);
|
|
233 }
|
|
234 else
|
|
235 {
|
|
236 string str_tmp(token);
|
|
237 size_t found_tmp;
|
|
238 int *pexonsize = exonsize;
|
|
239 while((found_tmp = str_tmp.find(",")) != string::npos){
|
|
240 sscanf((str_tmp.substr(0,found_tmp)).c_str(),"%d",pexonsize);
|
|
241 pexonsize++;
|
|
242 str_tmp.erase(0,found_tmp+1);
|
|
243 }
|
|
244 sscanf(str_tmp.c_str(),"%d",pexonsize);
|
|
245
|
|
246 }
|
|
247 token=strtok(NULL,"\t");
|
|
248
|
|
249 if(token==NULL){
|
|
250 cout<<"error7"<<endl;exit(0);
|
|
251 }
|
|
252 else
|
|
253 {
|
|
254 string str_tmp(token);
|
|
255 size_t found_tmp;
|
|
256 int *pexonstarts = exonstarts;
|
|
257 while((found_tmp = str_tmp.find(",")) != string::npos){
|
|
258 sscanf((str_tmp.substr(0,found_tmp)).c_str(),"%d",pexonstarts);
|
|
259 pexonstarts++;
|
|
260 str_tmp.erase(0,found_tmp+1);
|
|
261 }
|
|
262 sscanf(str_tmp.c_str(),"%d",pexonstarts);
|
|
263
|
|
264 }
|
|
265 len1 = exonsize[0]+exonsize[1]+exonsize[2];
|
|
266 len2 = exonsize[0]+exonsize[2];
|
|
267 snum12=0;
|
|
268 snum1 =0;
|
|
269 snum2 =0;
|
|
270 enum1=0;
|
|
271 enum2=0;
|
|
272 enum3=0;
|
|
273 jnum12=0;
|
|
274 jnum13=0;
|
|
275 jnum23=0;
|
|
276 totalreadnum=0;
|
|
277 }
|
|
278 else
|
|
279 {
|
|
280 iscomplete = 0;
|
|
281 }
|
|
282 }
|
|
283 };
|
|
284
|
|
285 class ReadInfo
|
|
286 {
|
|
287 public:
|
|
288 bool isUseful;
|
|
289 char *pline;
|
|
290 void scan(map<string, vector<int> > &map_pos )
|
|
291 //eventname, pos in L/S iso
|
|
292 {
|
|
293 char *token;
|
|
294 token = strtok(pline, "\t");
|
|
295 token = strtok(NULL, "\t");
|
|
296 token = strtok(NULL, "\t");
|
|
297 token = strtok(NULL, "\t");
|
|
298 string mapinfo(token);
|
|
299 size_t beg, end;
|
|
300 end = 0;
|
|
301 while( (beg= mapinfo.find_first_of("/", end))!=string::npos)
|
|
302 {
|
|
303 beg ++;
|
|
304 end = mapinfo.find_first_of("[",beg);
|
|
305 string eventid = mapinfo.substr( beg, end-beg ) ;
|
|
306 vector<int> pos(2,-1);
|
|
307 if(!map_pos.count(eventid))
|
|
308 {
|
|
309 map_pos.insert(pair<string, vector<int> >
|
|
310 (eventid, pos) );
|
|
311 }
|
|
312 end ++;
|
|
313 string LorS = mapinfo.substr( end,1 );
|
|
314 beg = mapinfo.find_first_of(":",end );
|
|
315 beg ++;
|
|
316 end = mapinfo.find_first_of(",",beg );
|
|
317 string pos_str(mapinfo.substr(beg, end-2-beg));
|
|
318
|
|
319 if(LorS.compare("L") == 0)
|
|
320 {
|
|
321 sscanf(pos_str.c_str(),"%d",&(map_pos[eventid][0]) );
|
|
322 }
|
|
323 else
|
|
324 {
|
|
325 sscanf(pos_str.c_str(),"%d",&(map_pos[eventid][1]) );
|
|
326 }
|
|
327
|
|
328 //cout<<eventid<<"\t"<<map_pos[eventid][0]<<"\t"<<map_pos[eventid][1]<<endl;
|
|
329 }
|
|
330
|
|
331 }
|
|
332 ReadInfo(char *line)
|
|
333 {
|
|
334 string line_str(line);
|
|
335 string line_str_last2=line_str.substr(line_str.size()-2,2);
|
|
336 if(line_str_last2.compare("NM") == 0 )
|
|
337 {
|
|
338 isUseful = 0;
|
|
339 }
|
|
340 else if(line_str_last2.compare("MT") == 0 )
|
|
341 {
|
|
342 isUseful = 0;
|
|
343 }
|
|
344 else
|
|
345 {
|
|
346 isUseful = 1;
|
|
347 pline = line;
|
|
348 }
|
|
349 }
|
|
350 };
|
|
351
|
|
352
|
|
353
|
|
354 class ReadPair
|
|
355 {
|
|
356 public:
|
|
357 map<string, ReadPairInfo> mappinginfo;
|
|
358 ReadPair(map<string, vector<int> > &read1_mappos, map<string, vector<int> > &read2_mappos, int &read_size)
|
|
359 {
|
|
360 map<string, vector<int> >::iterator it1 = read1_mappos.begin();
|
|
361 for(;it1!=read1_mappos.end();it1++)
|
|
362 {
|
|
363 ReadPairInfo ReadPairInfo_tmp;
|
|
364 ReadPairInfo_tmp.IsoFlag.resize(2);
|
|
365 ReadPairInfo_tmp.Size.resize(2);
|
|
366 //ReadPairInfo_tmp.Pos.resize(4);
|
|
367 ReadPairInfo_tmp.Pos.assign(4,-1);
|
|
368 string eventid = it1->first;
|
|
369 if(read2_mappos.count(eventid) > 0)
|
|
370 {
|
|
371 ReadPairInfo_tmp.isPaired = 0;
|
|
372 for(int i=0;i<2;i++)
|
|
373 {
|
|
374 //cout<<read1_mappos[eventid][i]<<endl;
|
|
375 if(read2_mappos[eventid][i] == -1 || read1_mappos[eventid][i] == -1)
|
|
376 {
|
|
377 ReadPairInfo_tmp.IsoFlag[i] = 0;
|
|
378 ReadPairInfo_tmp.Size[i] = 0;
|
|
379 //discard reads like this
|
|
380 }
|
|
381 else
|
|
382 {
|
|
383
|
|
384 ReadPairInfo_tmp.isPaired = 1;
|
|
385 ReadPairInfo_tmp.IsoFlag[i] = 1;
|
|
386 ReadPairInfo_tmp.Size[i] = abs(read2_mappos[eventid][i]
|
|
387 -read1_mappos[eventid][i])
|
|
388 +read_size;
|
|
389 ReadPairInfo_tmp.Pos[2*i] = read1_mappos[eventid][i];
|
|
390 ReadPairInfo_tmp.Pos[2*i+1] = read2_mappos[eventid][i];
|
|
391 }
|
|
392 }
|
|
393 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp));
|
|
394 }
|
|
395 else
|
|
396 {
|
|
397 ReadPairInfo_tmp.isPaired = 0;
|
|
398 for(int i=0;i<2;i++)
|
|
399 {
|
|
400 ReadPairInfo_tmp.Pos[2*i] = read1_mappos[eventid][i];
|
|
401 ReadPairInfo_tmp.Size[i] = 0;
|
|
402 if(read1_mappos[eventid][i] == -1)
|
|
403 {
|
|
404 ReadPairInfo_tmp.IsoFlag[i] = 0;
|
|
405
|
|
406 }
|
|
407 else
|
|
408 {
|
|
409 ReadPairInfo_tmp.IsoFlag[i] = 1;
|
|
410 }
|
|
411 }
|
|
412 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp));
|
|
413 }
|
|
414 }
|
|
415 map<string, vector<int> >::iterator it2 = read2_mappos.begin();
|
|
416 for(;it2!=read2_mappos.end();it2++)
|
|
417 {
|
|
418 string eventid = it2->first;
|
|
419 if(mappinginfo.count(eventid)>0)
|
|
420 {
|
|
421 continue;
|
|
422 }
|
|
423 ReadPairInfo ReadPairInfo_tmp;
|
|
424 ReadPairInfo_tmp.IsoFlag.resize(2);
|
|
425 ReadPairInfo_tmp.Size.resize(2);
|
|
426 //ReadPairInfo_tmp.Pos.resize(4);
|
|
427 ReadPairInfo_tmp.Pos.assign(4,-1);
|
|
428 if(read1_mappos.count(eventid) > 0)
|
|
429 {
|
|
430 ReadPairInfo_tmp.isPaired = 0;
|
|
431 for(int i=0;i<2;i++)
|
|
432 {
|
|
433 if(read2_mappos[eventid][i] == -1 || read1_mappos[eventid][i] == -1)
|
|
434 {
|
|
435 ReadPairInfo_tmp.IsoFlag[i] = 0;
|
|
436 ReadPairInfo_tmp.Size[i] = 0;
|
|
437 }
|
|
438 else
|
|
439 {
|
|
440 ReadPairInfo_tmp.isPaired = 1;
|
|
441 ReadPairInfo_tmp.IsoFlag[i] = 1;
|
|
442 ReadPairInfo_tmp.Size[i] = abs(read2_mappos[eventid][i]
|
|
443 -read1_mappos[eventid][i]
|
|
444 +read_size);
|
|
445 ReadPairInfo_tmp.Pos[2*i] = read1_mappos[eventid][i];
|
|
446 ReadPairInfo_tmp.Pos[2*i+1]=read2_mappos[eventid][i];
|
|
447 }
|
|
448 }
|
|
449 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp));
|
|
450 }
|
|
451 else
|
|
452 {
|
|
453 ReadPairInfo_tmp.isPaired = 0;
|
|
454 for(int i=0;i<2;i++)
|
|
455 {
|
|
456 ReadPairInfo_tmp.Pos[2*i+1]=read2_mappos[eventid][i];
|
|
457 ReadPairInfo_tmp.Size[i] = 0;
|
|
458 if(read2_mappos[eventid][i] == -1)
|
|
459 {
|
|
460 ReadPairInfo_tmp.IsoFlag[i] = 0;
|
|
461 }
|
|
462 else
|
|
463 {
|
|
464 ReadPairInfo_tmp.IsoFlag[i] = 1;
|
|
465 }
|
|
466 }
|
|
467 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp));
|
|
468 }
|
|
469
|
|
470 }
|
|
471 }
|
|
472 };
|
|
473
|
|
474 int main (int argc, char **argv) {
|
|
475 int next_option;
|
|
476 int digit_optind = 0;
|
|
477 int read_size = 36;
|
|
478 char *dbfile_name = 0,
|
|
479 *IRM_file_name = 0,
|
|
480 *FZM_file_name = 0,
|
|
481 *output_file_prefix = 0,
|
|
482 *read1_nomt_file_name = 0,
|
|
483 *read2_nomt_file_name = 0;
|
|
484
|
|
485 while ( (next_option = getopt(argc, argv, "s:d:b:f:o:1:2:")) != -1) {
|
|
486 int this_option_optind = optind ? optind : 1;
|
|
487 switch (next_option) {
|
|
488 case 's': sscanf (optarg,"%d",&read_size); break;
|
|
489 case 'd': dbfile_name = optarg; break;
|
|
490 case 'b': IRM_file_name = optarg; break;
|
|
491 case 'f': FZM_file_name = optarg; break;
|
|
492 case 'o': output_file_prefix = optarg; break;
|
|
493 case '1': read1_nomt_file_name = optarg; break;
|
|
494 case '2': read2_nomt_file_name = optarg; break;
|
|
495 default: break;
|
|
496 }
|
|
497 }
|
|
498 if (!read_size||
|
|
499 !dbfile_name||
|
|
500 !FZM_file_name||
|
|
501 !output_file_prefix||
|
|
502 !read1_nomt_file_name||
|
|
503 !read2_nomt_file_name)
|
|
504 {
|
|
505 printusage();
|
|
506 return 0;
|
|
507 }
|
|
508 clock_t clock_start, clock_finish;
|
|
509 clock_start=clock();
|
|
510 string ratio_file_name = output_file_prefix;
|
|
511 ratio_file_name.append(".ratio");
|
|
512 string log_file_name = output_file_prefix;
|
|
513 log_file_name.append(".log");
|
|
514 string num_file_name = output_file_prefix;
|
|
515 num_file_name.append(".nums");
|
|
516
|
|
517 ofstream log_file(log_file_name.c_str());
|
|
518 cout<<"Log file name:\t"<<log_file_name<<endl;
|
|
519 cout<<"Parameters:"<<endl;
|
|
520 cout<<"Read size:\t"<<read_size<<endl;
|
|
521 log_file<<"#read_size:\t"<<read_size<<endl;
|
|
522 cout<<"Database:\t"<<dbfile_name<<endl;
|
|
523 log_file<<"#dbfile_name:\t"<<dbfile_name<<endl;
|
|
524 cout<<"Fragment size file name:\t"<<FZM_file_name<<endl;
|
|
525 log_file<<"#FZM_file_name:\t"<<FZM_file_name<<endl;
|
|
526 cout<<"Output file prefix:\t"<<output_file_prefix<<endl;
|
|
527 cout<<"Read end1 file:\t"<<read1_nomt_file_name<<endl;
|
|
528 cout<<"Read end2 file:\t"<<read2_nomt_file_name<<endl;
|
|
529 log_file<<"#read1_nomt_file_name:\t"<<read1_nomt_file_name<<endl;
|
|
530 log_file<<"#read2_nomt_file_name:\t"<<read2_nomt_file_name<<endl;
|
|
531 if(IRM_file_name)
|
|
532 {
|
|
533 cout<<"Inclusion ratio distn. file:\t"<<IRM_file_name<<endl;
|
|
534 log_file<<"#IRM_file_name:\t"<<IRM_file_name<<endl;
|
|
535
|
|
536 }
|
|
537 else
|
|
538 {
|
|
539 cout<<"Inclusion ratio distn. file:\tNA"<<endl;
|
|
540 log_file<<"#IRM_file_name:\tNA"<<endl;
|
|
541 }
|
|
542
|
|
543 //start reading files, dbfile first
|
|
544 cout<<"Reading database file..."<<endl;
|
|
545 map<string, TXdb_entry> TXdb_entries;
|
|
546 ifstream dbfile(dbfile_name);
|
|
547 char dbfile_line[MAX_LINE_LEN];
|
|
548 while(dbfile.getline(dbfile_line,MAX_LINE_LEN)){
|
|
549 TXdb_entry entry_tmp(dbfile_line);
|
|
550 if(entry_tmp.iscomplete)
|
|
551 {
|
|
552 TXdb_entries.insert(pair<string, TXdb_entry>
|
|
553 (entry_tmp.id,entry_tmp)
|
|
554 );
|
|
555 //cout<<entry_tmp.exonsize[1]<<endl;
|
|
556 }
|
|
557
|
|
558 }
|
|
559 dbfile.close();
|
|
560
|
|
561 //start reading bayes prior if there is
|
|
562 map<string, vector< double > > IRM;
|
|
563 IRM.insert(pair<string, vector<double> > ("CA", vector<double>()));
|
|
564 IRM.insert(pair<string, vector<double> > ("CS", vector<double>()));
|
|
565 IRM.insert(pair<string, vector<double> > ("IR", vector<double>()));
|
|
566 IRM.insert(pair<string, vector<double> > ("AA", vector<double>()));
|
|
567 IRM.insert(pair<string, vector<double> > ("AD", vector<double>()));
|
|
568 //IRM.insert(pair<string, vector<double> > ("AI", vector<double>()));
|
|
569 if(IRM_file_name)
|
|
570 {
|
|
571 cout<<"Reading inclusion ratio distn. file..."<<endl;
|
|
572 ifstream IRM_file(IRM_file_name);
|
|
573 char IRM_line[MAX_LINE_LEN];
|
|
574 IRM_file.getline(IRM_line, MAX_LINE_LEN);
|
|
575 while(IRM_file.getline(IRM_line, MAX_LINE_LEN))
|
|
576 {
|
|
577 char *token;
|
|
578 token = strtok(IRM_line, "\t");
|
|
579 if (token != NULL) {
|
|
580 double hist_tmp;
|
|
581 sscanf(token,"%lf",&hist_tmp);
|
|
582 IRM["CA"].push_back(hist_tmp);
|
|
583 }
|
|
584 token = strtok(NULL,"\t");
|
|
585 if (token != NULL) {
|
|
586 double hist_tmp;
|
|
587 sscanf(token,"%lf",&hist_tmp);
|
|
588 IRM["AD"].push_back(hist_tmp);
|
|
589 }
|
|
590 token = strtok(NULL,"\t");
|
|
591 if (token != NULL) {
|
|
592 double hist_tmp;
|
|
593 sscanf(token,"%lf",&hist_tmp);
|
|
594 IRM["AA"].push_back(hist_tmp);
|
|
595 }
|
|
596 /* token = strtok(NULL,"\t");
|
|
597 if (token != NULL) {
|
|
598 double hist_tmp;
|
|
599 sscanf(token,"%lf",&hist_tmp);
|
|
600 IRM["AI"].push_back(hist_tmp);
|
|
601 }*/
|
|
602 token = strtok(NULL,"\t");
|
|
603 if (token != NULL) {
|
|
604 double hist_tmp;
|
|
605 sscanf(token,"%lf",&hist_tmp);
|
|
606 IRM["IR"].push_back(hist_tmp);
|
|
607 }
|
|
608 token = strtok(NULL,"\t");
|
|
609 if (token != NULL) {
|
|
610 double hist_tmp;
|
|
611 sscanf(token,"%lf",&hist_tmp);
|
|
612 IRM["CS"].push_back(hist_tmp);
|
|
613 }
|
|
614
|
|
615 }
|
|
616 IRM_file.close();
|
|
617 }
|
|
618
|
|
619 //read FZM
|
|
620 cout<<"Reading fragment size distn. file..."<<endl;
|
|
621 ifstream FZM_file(FZM_file_name);
|
|
622 vector<double> FZM;
|
|
623 char FZM_file_line[MAX_LINE_LEN];
|
|
624 FZM_file.getline(FZM_file_line,MAX_LINE_LEN);
|
|
625 while(FZM_file.getline (FZM_file_line,MAX_LINE_LEN) )
|
|
626 {
|
|
627 double hist_tmp;
|
|
628 sscanf(FZM_file_line, "%lf", &hist_tmp) ;
|
|
629 FZM.push_back(hist_tmp);
|
|
630 }
|
|
631 FZM_file.close();
|
|
632 //start to read reads
|
|
633 cout<<"Reading reads from both ends...."<<endl;
|
|
634 ifstream read1_nomt_file(read1_nomt_file_name);
|
|
635 ifstream read2_nomt_file(read2_nomt_file_name);
|
|
636 char read1_line[MAX_LINE_LEN];
|
|
637 char read2_line[MAX_LINE_LEN];
|
|
638 int total_single_read_num=0;
|
|
639 int total_pair_read_num=0;
|
|
640 int total_read_num=0;
|
|
641 while(read1_nomt_file.getline(read1_line, MAX_LINE_LEN))
|
|
642 {
|
|
643 total_read_num++;
|
|
644 ReadInfo read1(read1_line);
|
|
645 map<string, vector<int> > read1_mappos;
|
|
646 if(read1.isUseful)
|
|
647 {
|
|
648 read1.scan(read1_mappos);
|
|
649 }
|
|
650 read2_nomt_file.getline(read2_line, MAX_LINE_LEN);
|
|
651 ReadInfo read2(read2_line);
|
|
652 map<string, vector<int> > read2_mappos;
|
|
653 if(read2.isUseful)
|
|
654 {
|
|
655 read2.scan(read2_mappos);
|
|
656 }
|
|
657 //pairing
|
|
658 ReadPair read_pair_ins (read1_mappos, read2_mappos, read_size);
|
|
659 //feed them into TXdb_entries
|
|
660 map<string , ReadPairInfo>::iterator it = read_pair_ins.mappinginfo.begin();
|
|
661 for(; it!=read_pair_ins.mappinginfo.end();it++)
|
|
662 {
|
|
663 string eventid = it->first;
|
|
664 map<string, TXdb_entry>::iterator txdb_it = TXdb_entries.find(eventid);
|
|
665 if(txdb_it == TXdb_entries.end() )
|
|
666 {
|
|
667 continue;
|
|
668 }
|
|
669 //get numbers
|
|
670 for(int i=0; i<4;i++)
|
|
671 {
|
|
672 if(read_pair_ins.mappinginfo[eventid].Pos[i]!=-1)
|
|
673 {
|
|
674 int pos_tmp1 = read_pair_ins.mappinginfo[eventid].Pos[i];
|
|
675 int pos_tmp2;
|
|
676 if(i<2)
|
|
677 {
|
|
678 if (txdb_it->second.strand == '-')
|
|
679 pos_tmp1 = txdb_it->second.len1 -pos_tmp1 +1- read_size;
|
|
680 pos_tmp2 = pos_tmp1 + read_size;
|
|
681 if (pos_tmp2 <= txdb_it->second.exonsize[0])
|
|
682 { txdb_it->second.enum1++;continue;}
|
|
683 if (pos_tmp1 <= txdb_it->second.exonsize[0] )
|
|
684 {
|
|
685 if(read_pair_ins.mappinginfo[eventid].IsoFlag[1] !=1)
|
|
686 {
|
|
687 txdb_it->second.jnum12++;
|
|
688 }
|
|
689 else
|
|
690 {
|
|
691 txdb_it->second.enum1++;
|
|
692 }
|
|
693 continue;
|
|
694 }
|
|
695 if (pos_tmp2 <= txdb_it->second.exonsize[0]+txdb_it->second.exonsize[1])
|
|
696 { txdb_it->second.enum2++;continue;}
|
|
697 if (pos_tmp1 <= txdb_it->second.exonsize[0]+txdb_it->second.exonsize[1])
|
|
698 {
|
|
699 if(read_pair_ins.mappinginfo[eventid].IsoFlag[1] !=1)
|
|
700 {
|
|
701 txdb_it->second.jnum23++;
|
|
702 }
|
|
703 else
|
|
704 {
|
|
705 txdb_it->second.enum2++;
|
|
706 }
|
|
707 continue;
|
|
708
|
|
709 }
|
|
710 txdb_it->second.enum3++;
|
|
711 }
|
|
712 else
|
|
713 {
|
|
714 if (read_pair_ins.mappinginfo[eventid].IsoFlag[0] == 1
|
|
715 && read_pair_ins.mappinginfo[eventid].IsoFlag[1] == 1)
|
|
716 continue;
|
|
717 if (txdb_it->second.strand == '-')
|
|
718 pos_tmp1 = txdb_it->second.len2 -pos_tmp1 +1- read_size;
|
|
719 pos_tmp2 = pos_tmp1 + read_size;
|
|
720 if (pos_tmp2 <= txdb_it->second.exonsize[0])
|
|
721 { txdb_it->second.enum1++;continue;}
|
|
722 if (pos_tmp1 <= txdb_it->second.exonsize[0] )
|
|
723 {
|
|
724 if(read_pair_ins.mappinginfo[eventid].IsoFlag[0] !=1)
|
|
725 {
|
|
726 txdb_it->second.jnum13++;
|
|
727 }
|
|
728 else
|
|
729 {
|
|
730 txdb_it->second.enum1++;
|
|
731 }
|
|
732 continue;
|
|
733 }
|
|
734 txdb_it->second.enum3++;
|
|
735
|
|
736
|
|
737 }
|
|
738 }
|
|
739 }
|
|
740
|
|
741 if(read_pair_ins.mappinginfo[eventid].isPaired)
|
|
742 {
|
|
743 total_pair_read_num += 2;
|
|
744 txdb_it->second.totalreadnum += 2;
|
|
745 (txdb_it->second).pairs.push_back(read_pair_ins.mappinginfo[eventid]);
|
|
746 }
|
|
747 else
|
|
748 {
|
|
749 total_single_read_num += 2;
|
|
750 txdb_it->second.totalreadnum += 1;
|
|
751 if(read_pair_ins.mappinginfo[eventid].IsoFlag[0] == 1)
|
|
752 {
|
|
753 if(read_pair_ins.mappinginfo[eventid].IsoFlag[1] == 0)
|
|
754 {
|
|
755 (txdb_it->second).snum1++;
|
|
756 }
|
|
757 else
|
|
758 {
|
|
759 (txdb_it->second).snum12++;
|
|
760 }
|
|
761 }
|
|
762 else
|
|
763 {
|
|
764 (txdb_it->second).snum2++;
|
|
765 }
|
|
766 }
|
|
767 }
|
|
768
|
|
769
|
|
770 }
|
|
771
|
|
772 read1_nomt_file.close();
|
|
773 read2_nomt_file.close();
|
|
774 cout<<"Total read number:\t"<<total_read_num*2<<endl;
|
|
775 cout<<"Singled reads used:\t"<<total_single_read_num<<endl;
|
|
776 cout<<"Paired reads used:\t"<<total_pair_read_num<<endl;
|
|
777
|
|
778 log_file<<"#total_read_num:\t"<<total_read_num*2<<endl;
|
|
779 log_file<<"#total_single_read_num:\t"<<total_single_read_num<<endl;
|
|
780 log_file<<"#total_pair_read_num:\t"<<total_pair_read_num<<endl;
|
|
781
|
|
782 //Calculate ratios and output
|
|
783 cout<<"Now doing estimations and output to "<<ratio_file_name<<"\t"<<num_file_name<<endl;
|
|
784 ofstream ratio_file(ratio_file_name.c_str());
|
|
785 ofstream num_file(num_file_name.c_str());
|
|
786 map<string, TXdb_entry>::iterator txdb_it = TXdb_entries.begin();
|
|
787 for(;txdb_it!=TXdb_entries.end();txdb_it++)
|
|
788 {
|
|
789 string eventid = txdb_it->first;
|
|
790 //cout<< eventid <<endl;
|
|
791 txdb_it->second.estimate(FZM,IRM,read_size);
|
|
792 /* ratio_file<< eventid<<"\t"
|
|
793 <<setprecision(4)<<fixed
|
|
794 <<txdb_it->second.ir<<"\t"
|
|
795 <<txdb_it->second.bir<<"\t"
|
|
796 <<"NA\tNA\tNA\t"
|
|
797 <<txdb_it->second.jnum12<<"\t"
|
|
798 <<txdb_it->second.jnum23<<"\t"
|
|
799 <<txdb_it->second.jnum13<<"\t"
|
|
800 <<(1.0*read_size*( txdb_it->second.enum1+txdb_it->second.jnum12 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[0] )<<"\t"
|
|
801 <<(1.0*read_size*( txdb_it->second.enum2+txdb_it->second.jnum12 +txdb_it->second.jnum23 )/txdb_it->second.exonsize[1] )<<"\t"
|
|
802 <<(1.0*read_size*( txdb_it->second.enum3+txdb_it->second.jnum23 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[2] )<<"\t"
|
|
803 <<txdb_it->second.exonsize[0]<<"\t"
|
|
804 <<txdb_it->second.exonsize[1]<<"\t"
|
|
805 <<txdb_it->second.exonsize[2]<<"\t"
|
|
806 <<(1.0*read_size*txdb_it->second.totalreadnum/txdb_it->second.len1)<<endl;
|
|
807 */
|
|
808 ratio_file<<eventid<<"\t"
|
|
809 <<setprecision(4)<<fixed
|
|
810 <<txdb_it->second.ir<<"\t" //1
|
|
811 <<txdb_it->second.bir<<"\t" //2
|
|
812 <<txdb_it->second.chrid<<"\t" //3
|
|
813 <<txdb_it->second.start<<"," //
|
|
814 <<txdb_it->second.start+txdb_it->second.exonsize[0]<<"\t" //4
|
|
815 <<txdb_it->second.start+txdb_it->second.exonstarts[1]<<","
|
|
816 <<txdb_it->second.start+txdb_it->second.exonstarts[1]+txdb_it->second.exonsize[1]<<"\t"//5
|
|
817 <<txdb_it->second.start+txdb_it->second.exonstarts[2]<<","
|
|
818 <<txdb_it->second.start+txdb_it->second.exonstarts[2]+txdb_it->second.exonsize[2]<<"\t"//6
|
|
819 <<txdb_it->second.strand<<"\t"//7
|
|
820 <<txdb_it->second.jnum12<<"\t"//8
|
|
821 <<txdb_it->second.jnum23<<"\t"//9
|
|
822 <<txdb_it->second.jnum13<<"\t"//10
|
|
823 <<(1.0*read_size*( txdb_it->second.enum1+txdb_it->second.jnum12 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[0] )<<"\t"//11
|
|
824 <<(1.0*read_size*( txdb_it->second.enum2+txdb_it->second.jnum12 +txdb_it->second.jnum23 )/txdb_it->second.exonsize[1] )<<"\t"//12
|
|
825 <<(1.0*read_size*( txdb_it->second.enum3+txdb_it->second.jnum23 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[2] )<<"\t"//13`
|
|
826 <<(1.0*read_size*txdb_it->second.totalreadnum/txdb_it->second.len1)<<"\t"//14
|
|
827 <<txdb_it->second.exonsize[0]<<"\t"//15
|
|
828 <<txdb_it->second.exonsize[1]<<"\t"//16
|
|
829 <<txdb_it->second.exonsize[2]<<endl;//17
|
|
830
|
|
831 num_file<< eventid<<"\t"
|
|
832 <<txdb_it->second.chrid<<"\t"
|
|
833 <<txdb_it->second.start<<","
|
|
834 <<txdb_it->second.start+txdb_it->second.exonsize[0]<<"\t"
|
|
835 <<txdb_it->second.start+txdb_it->second.exonstarts[1]<<","
|
|
836 <<txdb_it->second.start+txdb_it->second.exonstarts[1]+txdb_it->second.exonsize[1]<<"\t"
|
|
837 <<txdb_it->second.start+txdb_it->second.exonstarts[2]<<","
|
|
838 <<txdb_it->second.start+txdb_it->second.exonstarts[2]+txdb_it->second.exonsize[2]<<"\t"
|
|
839 <<txdb_it->second.strand<<"\t"
|
|
840 <<txdb_it->second.enum1<<"\t"
|
|
841 <<txdb_it->second.enum2<<"\t"
|
|
842 <<txdb_it->second.enum3<<"\t"
|
|
843 <<txdb_it->second.jnum12<<"\t"
|
|
844 <<txdb_it->second.jnum23<<"\t"
|
|
845 <<txdb_it->second.jnum13<<endl;
|
|
846 }
|
|
847 log_file.close();
|
|
848 ratio_file.close();
|
|
849 num_file.close();
|
|
850 clock_finish = clock();
|
|
851 cout<<"Done! time used:"<<(double)(clock_finish-clock_start)/CLOCKS_PER_SEC<<" seconds"<<endl;
|
|
852
|
|
853
|
|
854 }
|