Mercurial > repos > bioitcore > splicetrap
comparison src/splicetrap.estimate.cpp @ 1:adc0f7765d85 draft
planemo upload
author | bioitcore |
---|---|
date | Thu, 07 Sep 2017 15:06:58 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d4ca551ca300 | 1:adc0f7765d85 |
---|---|
1 //Author: Jie Wu@CSHL | |
2 //TO replace the original Perl code | |
3 | |
4 #include <stdio.h> | |
5 #include <stdlib.h> | |
6 #include <iostream> | |
7 #include <fstream> | |
8 #include <string.h> | |
9 #include <unistd.h> | |
10 #include <sstream> | |
11 #include <map> | |
12 #include <vector> | |
13 #include <math.h> | |
14 #include <time.h> | |
15 #include <limits> | |
16 #include <iomanip> | |
17 | |
18 using namespace std; | |
19 | |
20 int MAX_LINE_LEN = 1024*1024; | |
21 | |
22 void printusage() | |
23 { | |
24 cout<< "\tUsage:"<<endl; | |
25 cout<<"\t-s\tread_size"<<endl; | |
26 cout<<"\t-b\tIRM file"<<endl; | |
27 cout<<"\t-f\tFZM file"<<endl; | |
28 cout<<"\t-1\tmapping result 1"<<endl; | |
29 cout<<"\t-2\tmapping result 2"<<endl; | |
30 cout<<"\t-d\tTXdb database file"<<endl; | |
31 cout<<"\t-o\toutput prefix"<<endl; | |
32 | |
33 } | |
34 | |
35 struct ReadPairInfo | |
36 { | |
37 bool isPaired; | |
38 vector<int> IsoFlag; //flag as 1 if mapped to the isoform | |
39 vector<int> Size; //fragment size in the isoform | |
40 vector<int> Pos; | |
41 }; | |
42 | |
43 class TXdb_entry{ | |
44 public: | |
45 string chrid; | |
46 //int start; | |
47 //int end; | |
48 string id; | |
49 bool iscomplete; | |
50 int start; | |
51 int end; | |
52 int exonstarts[3]; | |
53 char strand; | |
54 int exonsize[3]; | |
55 int len1; | |
56 int len2; | |
57 vector< ReadPairInfo > pairs; | |
58 int snum1; //single-end reads | |
59 int snum2; | |
60 int snum12; | |
61 float ir, bir; | |
62 int enum1; //exon body reads | |
63 int enum2; | |
64 int enum3; | |
65 | |
66 int jnum12; | |
67 int jnum23; | |
68 int jnum13; | |
69 | |
70 int totalreadnum; | |
71 void estimate(vector<double> &FZM, map<string, vector<double> > &IRM, int &read_size) | |
72 { | |
73 float Maxe=0; | |
74 float Max = -numeric_limits<float>::max(); | |
75 float BMaxe=0; | |
76 float BMax=-numeric_limits<float>::max(); | |
77 for(float e1=0.001; e1<1;e1=e1+0.001) | |
78 { | |
79 float efix1 = e1*len1/ | |
80 (e1*exonsize[1]+len2); | |
81 float efix2 = 1-efix1; | |
82 float LL = snum12*log( efix1/(len1-read_size+1) | |
83 + efix2/(len2-read_size+1) ) | |
84 + snum1*log(efix1/(len1-read_size+1)) | |
85 + snum2*log(efix2/(len2-read_size+1)); | |
86 //cout<<LL<<endl; | |
87 //cout<<efix1<<"\t" <<pairs.size()<<"\t" <<len2<<"\t" <<endl; | |
88 //int num1=0, num2=0,num12=0; | |
89 for(int i=0;i<pairs.size();i++) | |
90 { | |
91 if(pairs[i].IsoFlag[0] == 1) | |
92 { | |
93 if(pairs[i].IsoFlag[1] == 0) | |
94 { | |
95 // num1++; | |
96 float tmp=FZM[pairs[i].Size[0]]*efix1/(len1-pairs[i].Size[0]+1); | |
97 if (tmp == 0) | |
98 { | |
99 LL = LL - 308; | |
100 } | |
101 else | |
102 { | |
103 LL = LL + log(tmp); | |
104 } | |
105 } | |
106 else | |
107 { | |
108 // num12++; | |
109 float tmp=(FZM[pairs[i].Size[0]]*efix1/(len1-pairs[i].Size[0]+1)) | |
110 + (FZM[pairs[i].Size[1]]*efix2/(len2-pairs[i].Size[1]+1)); | |
111 if(tmp ==0) | |
112 { | |
113 LL = LL -308; | |
114 } | |
115 else | |
116 { | |
117 LL = LL + log(tmp); | |
118 } | |
119 } | |
120 } | |
121 else | |
122 { | |
123 //num2++; | |
124 float tmp = FZM[pairs[i].Size[1]]*efix2/(len2-pairs[i].Size[1]+1); | |
125 if(tmp ==0) | |
126 { | |
127 LL= LL - 308; | |
128 } | |
129 else | |
130 { | |
131 LL = LL +log(tmp); | |
132 } | |
133 } | |
134 } | |
135 //cout<<num1<<"\t"<<num2<<"\t"<<num12<<"\n"; | |
136 //cout<<LL<<endl; | |
137 | |
138 if(IRM["CA"].size() > 0) | |
139 { | |
140 float BLL; | |
141 string eventtype =id.substr(0,2); | |
142 if(!eventtype.compare("ME")) | |
143 { | |
144 eventtype = "CA"; | |
145 } | |
146 BLL = LL + log(IRM[eventtype][int(e1/0.001)]); | |
147 if(BLL > BMax) | |
148 { | |
149 BMax = BLL; | |
150 BMaxe = e1; | |
151 } | |
152 } | |
153 //cout<<LL<<endl; | |
154 if(LL > Max) | |
155 { | |
156 Max= LL; | |
157 Maxe = e1; | |
158 | |
159 } | |
160 | |
161 } | |
162 ir= Maxe; | |
163 bir= BMaxe; | |
164 } | |
165 | |
166 TXdb_entry(char *line) | |
167 { | |
168 size_t found; | |
169 string linestr(line); | |
170 found = linestr.find("[L]"); | |
171 if(found != string::npos) | |
172 { | |
173 iscomplete=1; | |
174 | |
175 char *token; | |
176 | |
177 token = strtok(line,"\t"); | |
178 if(token==NULL) | |
179 { | |
180 cout<<"error1"<<endl; | |
181 exit(0); | |
182 } | |
183 else | |
184 { | |
185 chrid = token; | |
186 } | |
187 token = strtok(NULL,"\t"); | |
188 if(token==NULL) | |
189 { | |
190 cout<<"error2"<<endl; | |
191 exit(0); | |
192 } | |
193 else | |
194 { | |
195 sscanf(token,"%d",&start); | |
196 } | |
197 token = strtok(NULL,"\t"); | |
198 if(token==NULL) | |
199 { | |
200 cout<<"error3"<<endl; | |
201 exit(0); | |
202 } | |
203 else | |
204 { | |
205 sscanf(token,"%d",&end); | |
206 } | |
207 token = strtok(NULL,"\t"); | |
208 if(token==NULL) | |
209 { | |
210 cout<<"error4"<<endl;exit(0); | |
211 } | |
212 else | |
213 { | |
214 id = token; | |
215 id = id.substr(0,id.size()-3); | |
216 } | |
217 token = strtok(NULL,"\t"); | |
218 token = strtok(NULL,"\t"); | |
219 if(token==NULL) | |
220 { | |
221 cout<<"error5"<<endl;exit(0); | |
222 } | |
223 else | |
224 { | |
225 sscanf(token,"%c",&strand); | |
226 } | |
227 for(int i=0;i<5;i++) | |
228 { | |
229 token=strtok(NULL,"\t"); | |
230 } | |
231 if(token==NULL){ | |
232 cout<<"error6"<<endl;exit(0); | |
233 } | |
234 else | |
235 { | |
236 string str_tmp(token); | |
237 size_t found_tmp; | |
238 int *pexonsize = exonsize; | |
239 while((found_tmp = str_tmp.find(",")) != string::npos){ | |
240 sscanf((str_tmp.substr(0,found_tmp)).c_str(),"%d",pexonsize); | |
241 pexonsize++; | |
242 str_tmp.erase(0,found_tmp+1); | |
243 } | |
244 sscanf(str_tmp.c_str(),"%d",pexonsize); | |
245 | |
246 } | |
247 token=strtok(NULL,"\t"); | |
248 | |
249 if(token==NULL){ | |
250 cout<<"error7"<<endl;exit(0); | |
251 } | |
252 else | |
253 { | |
254 string str_tmp(token); | |
255 size_t found_tmp; | |
256 int *pexonstarts = exonstarts; | |
257 while((found_tmp = str_tmp.find(",")) != string::npos){ | |
258 sscanf((str_tmp.substr(0,found_tmp)).c_str(),"%d",pexonstarts); | |
259 pexonstarts++; | |
260 str_tmp.erase(0,found_tmp+1); | |
261 } | |
262 sscanf(str_tmp.c_str(),"%d",pexonstarts); | |
263 | |
264 } | |
265 len1 = exonsize[0]+exonsize[1]+exonsize[2]; | |
266 len2 = exonsize[0]+exonsize[2]; | |
267 snum12=0; | |
268 snum1 =0; | |
269 snum2 =0; | |
270 enum1=0; | |
271 enum2=0; | |
272 enum3=0; | |
273 jnum12=0; | |
274 jnum13=0; | |
275 jnum23=0; | |
276 totalreadnum=0; | |
277 } | |
278 else | |
279 { | |
280 iscomplete = 0; | |
281 } | |
282 } | |
283 }; | |
284 | |
285 class ReadInfo | |
286 { | |
287 public: | |
288 bool isUseful; | |
289 char *pline; | |
290 void scan(map<string, vector<int> > &map_pos ) | |
291 //eventname, pos in L/S iso | |
292 { | |
293 char *token; | |
294 token = strtok(pline, "\t"); | |
295 token = strtok(NULL, "\t"); | |
296 token = strtok(NULL, "\t"); | |
297 token = strtok(NULL, "\t"); | |
298 string mapinfo(token); | |
299 size_t beg, end; | |
300 end = 0; | |
301 while( (beg= mapinfo.find_first_of("/", end))!=string::npos) | |
302 { | |
303 beg ++; | |
304 end = mapinfo.find_first_of("[",beg); | |
305 string eventid = mapinfo.substr( beg, end-beg ) ; | |
306 vector<int> pos(2,-1); | |
307 if(!map_pos.count(eventid)) | |
308 { | |
309 map_pos.insert(pair<string, vector<int> > | |
310 (eventid, pos) ); | |
311 } | |
312 end ++; | |
313 string LorS = mapinfo.substr( end,1 ); | |
314 beg = mapinfo.find_first_of(":",end ); | |
315 beg ++; | |
316 end = mapinfo.find_first_of(",",beg ); | |
317 string pos_str(mapinfo.substr(beg, end-2-beg)); | |
318 | |
319 if(LorS.compare("L") == 0) | |
320 { | |
321 sscanf(pos_str.c_str(),"%d",&(map_pos[eventid][0]) ); | |
322 } | |
323 else | |
324 { | |
325 sscanf(pos_str.c_str(),"%d",&(map_pos[eventid][1]) ); | |
326 } | |
327 | |
328 //cout<<eventid<<"\t"<<map_pos[eventid][0]<<"\t"<<map_pos[eventid][1]<<endl; | |
329 } | |
330 | |
331 } | |
332 ReadInfo(char *line) | |
333 { | |
334 string line_str(line); | |
335 string line_str_last2=line_str.substr(line_str.size()-2,2); | |
336 if(line_str_last2.compare("NM") == 0 ) | |
337 { | |
338 isUseful = 0; | |
339 } | |
340 else if(line_str_last2.compare("MT") == 0 ) | |
341 { | |
342 isUseful = 0; | |
343 } | |
344 else | |
345 { | |
346 isUseful = 1; | |
347 pline = line; | |
348 } | |
349 } | |
350 }; | |
351 | |
352 | |
353 | |
354 class ReadPair | |
355 { | |
356 public: | |
357 map<string, ReadPairInfo> mappinginfo; | |
358 ReadPair(map<string, vector<int> > &read1_mappos, map<string, vector<int> > &read2_mappos, int &read_size) | |
359 { | |
360 map<string, vector<int> >::iterator it1 = read1_mappos.begin(); | |
361 for(;it1!=read1_mappos.end();it1++) | |
362 { | |
363 ReadPairInfo ReadPairInfo_tmp; | |
364 ReadPairInfo_tmp.IsoFlag.resize(2); | |
365 ReadPairInfo_tmp.Size.resize(2); | |
366 //ReadPairInfo_tmp.Pos.resize(4); | |
367 ReadPairInfo_tmp.Pos.assign(4,-1); | |
368 string eventid = it1->first; | |
369 if(read2_mappos.count(eventid) > 0) | |
370 { | |
371 ReadPairInfo_tmp.isPaired = 0; | |
372 for(int i=0;i<2;i++) | |
373 { | |
374 //cout<<read1_mappos[eventid][i]<<endl; | |
375 if(read2_mappos[eventid][i] == -1 || read1_mappos[eventid][i] == -1) | |
376 { | |
377 ReadPairInfo_tmp.IsoFlag[i] = 0; | |
378 ReadPairInfo_tmp.Size[i] = 0; | |
379 //discard reads like this | |
380 } | |
381 else | |
382 { | |
383 | |
384 ReadPairInfo_tmp.isPaired = 1; | |
385 ReadPairInfo_tmp.IsoFlag[i] = 1; | |
386 ReadPairInfo_tmp.Size[i] = abs(read2_mappos[eventid][i] | |
387 -read1_mappos[eventid][i]) | |
388 +read_size; | |
389 ReadPairInfo_tmp.Pos[2*i] = read1_mappos[eventid][i]; | |
390 ReadPairInfo_tmp.Pos[2*i+1] = read2_mappos[eventid][i]; | |
391 } | |
392 } | |
393 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp)); | |
394 } | |
395 else | |
396 { | |
397 ReadPairInfo_tmp.isPaired = 0; | |
398 for(int i=0;i<2;i++) | |
399 { | |
400 ReadPairInfo_tmp.Pos[2*i] = read1_mappos[eventid][i]; | |
401 ReadPairInfo_tmp.Size[i] = 0; | |
402 if(read1_mappos[eventid][i] == -1) | |
403 { | |
404 ReadPairInfo_tmp.IsoFlag[i] = 0; | |
405 | |
406 } | |
407 else | |
408 { | |
409 ReadPairInfo_tmp.IsoFlag[i] = 1; | |
410 } | |
411 } | |
412 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp)); | |
413 } | |
414 } | |
415 map<string, vector<int> >::iterator it2 = read2_mappos.begin(); | |
416 for(;it2!=read2_mappos.end();it2++) | |
417 { | |
418 string eventid = it2->first; | |
419 if(mappinginfo.count(eventid)>0) | |
420 { | |
421 continue; | |
422 } | |
423 ReadPairInfo ReadPairInfo_tmp; | |
424 ReadPairInfo_tmp.IsoFlag.resize(2); | |
425 ReadPairInfo_tmp.Size.resize(2); | |
426 //ReadPairInfo_tmp.Pos.resize(4); | |
427 ReadPairInfo_tmp.Pos.assign(4,-1); | |
428 if(read1_mappos.count(eventid) > 0) | |
429 { | |
430 ReadPairInfo_tmp.isPaired = 0; | |
431 for(int i=0;i<2;i++) | |
432 { | |
433 if(read2_mappos[eventid][i] == -1 || read1_mappos[eventid][i] == -1) | |
434 { | |
435 ReadPairInfo_tmp.IsoFlag[i] = 0; | |
436 ReadPairInfo_tmp.Size[i] = 0; | |
437 } | |
438 else | |
439 { | |
440 ReadPairInfo_tmp.isPaired = 1; | |
441 ReadPairInfo_tmp.IsoFlag[i] = 1; | |
442 ReadPairInfo_tmp.Size[i] = abs(read2_mappos[eventid][i] | |
443 -read1_mappos[eventid][i] | |
444 +read_size); | |
445 ReadPairInfo_tmp.Pos[2*i] = read1_mappos[eventid][i]; | |
446 ReadPairInfo_tmp.Pos[2*i+1]=read2_mappos[eventid][i]; | |
447 } | |
448 } | |
449 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp)); | |
450 } | |
451 else | |
452 { | |
453 ReadPairInfo_tmp.isPaired = 0; | |
454 for(int i=0;i<2;i++) | |
455 { | |
456 ReadPairInfo_tmp.Pos[2*i+1]=read2_mappos[eventid][i]; | |
457 ReadPairInfo_tmp.Size[i] = 0; | |
458 if(read2_mappos[eventid][i] == -1) | |
459 { | |
460 ReadPairInfo_tmp.IsoFlag[i] = 0; | |
461 } | |
462 else | |
463 { | |
464 ReadPairInfo_tmp.IsoFlag[i] = 1; | |
465 } | |
466 } | |
467 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp)); | |
468 } | |
469 | |
470 } | |
471 } | |
472 }; | |
473 | |
474 int main (int argc, char **argv) { | |
475 int next_option; | |
476 int digit_optind = 0; | |
477 int read_size = 36; | |
478 char *dbfile_name = 0, | |
479 *IRM_file_name = 0, | |
480 *FZM_file_name = 0, | |
481 *output_file_prefix = 0, | |
482 *read1_nomt_file_name = 0, | |
483 *read2_nomt_file_name = 0; | |
484 | |
485 while ( (next_option = getopt(argc, argv, "s:d:b:f:o:1:2:")) != -1) { | |
486 int this_option_optind = optind ? optind : 1; | |
487 switch (next_option) { | |
488 case 's': sscanf (optarg,"%d",&read_size); break; | |
489 case 'd': dbfile_name = optarg; break; | |
490 case 'b': IRM_file_name = optarg; break; | |
491 case 'f': FZM_file_name = optarg; break; | |
492 case 'o': output_file_prefix = optarg; break; | |
493 case '1': read1_nomt_file_name = optarg; break; | |
494 case '2': read2_nomt_file_name = optarg; break; | |
495 default: break; | |
496 } | |
497 } | |
498 if (!read_size|| | |
499 !dbfile_name|| | |
500 !FZM_file_name|| | |
501 !output_file_prefix|| | |
502 !read1_nomt_file_name|| | |
503 !read2_nomt_file_name) | |
504 { | |
505 printusage(); | |
506 return 0; | |
507 } | |
508 clock_t clock_start, clock_finish; | |
509 clock_start=clock(); | |
510 string ratio_file_name = output_file_prefix; | |
511 ratio_file_name.append(".ratio"); | |
512 string log_file_name = output_file_prefix; | |
513 log_file_name.append(".log"); | |
514 string num_file_name = output_file_prefix; | |
515 num_file_name.append(".nums"); | |
516 | |
517 ofstream log_file(log_file_name.c_str()); | |
518 cout<<"Log file name:\t"<<log_file_name<<endl; | |
519 cout<<"Parameters:"<<endl; | |
520 cout<<"Read size:\t"<<read_size<<endl; | |
521 log_file<<"#read_size:\t"<<read_size<<endl; | |
522 cout<<"Database:\t"<<dbfile_name<<endl; | |
523 log_file<<"#dbfile_name:\t"<<dbfile_name<<endl; | |
524 cout<<"Fragment size file name:\t"<<FZM_file_name<<endl; | |
525 log_file<<"#FZM_file_name:\t"<<FZM_file_name<<endl; | |
526 cout<<"Output file prefix:\t"<<output_file_prefix<<endl; | |
527 cout<<"Read end1 file:\t"<<read1_nomt_file_name<<endl; | |
528 cout<<"Read end2 file:\t"<<read2_nomt_file_name<<endl; | |
529 log_file<<"#read1_nomt_file_name:\t"<<read1_nomt_file_name<<endl; | |
530 log_file<<"#read2_nomt_file_name:\t"<<read2_nomt_file_name<<endl; | |
531 if(IRM_file_name) | |
532 { | |
533 cout<<"Inclusion ratio distn. file:\t"<<IRM_file_name<<endl; | |
534 log_file<<"#IRM_file_name:\t"<<IRM_file_name<<endl; | |
535 | |
536 } | |
537 else | |
538 { | |
539 cout<<"Inclusion ratio distn. file:\tNA"<<endl; | |
540 log_file<<"#IRM_file_name:\tNA"<<endl; | |
541 } | |
542 | |
543 //start reading files, dbfile first | |
544 cout<<"Reading database file..."<<endl; | |
545 map<string, TXdb_entry> TXdb_entries; | |
546 ifstream dbfile(dbfile_name); | |
547 char dbfile_line[MAX_LINE_LEN]; | |
548 while(dbfile.getline(dbfile_line,MAX_LINE_LEN)){ | |
549 TXdb_entry entry_tmp(dbfile_line); | |
550 if(entry_tmp.iscomplete) | |
551 { | |
552 TXdb_entries.insert(pair<string, TXdb_entry> | |
553 (entry_tmp.id,entry_tmp) | |
554 ); | |
555 //cout<<entry_tmp.exonsize[1]<<endl; | |
556 } | |
557 | |
558 } | |
559 dbfile.close(); | |
560 | |
561 //start reading bayes prior if there is | |
562 map<string, vector< double > > IRM; | |
563 IRM.insert(pair<string, vector<double> > ("CA", vector<double>())); | |
564 IRM.insert(pair<string, vector<double> > ("CS", vector<double>())); | |
565 IRM.insert(pair<string, vector<double> > ("IR", vector<double>())); | |
566 IRM.insert(pair<string, vector<double> > ("AA", vector<double>())); | |
567 IRM.insert(pair<string, vector<double> > ("AD", vector<double>())); | |
568 //IRM.insert(pair<string, vector<double> > ("AI", vector<double>())); | |
569 if(IRM_file_name) | |
570 { | |
571 cout<<"Reading inclusion ratio distn. file..."<<endl; | |
572 ifstream IRM_file(IRM_file_name); | |
573 char IRM_line[MAX_LINE_LEN]; | |
574 IRM_file.getline(IRM_line, MAX_LINE_LEN); | |
575 while(IRM_file.getline(IRM_line, MAX_LINE_LEN)) | |
576 { | |
577 char *token; | |
578 token = strtok(IRM_line, "\t"); | |
579 if (token != NULL) { | |
580 double hist_tmp; | |
581 sscanf(token,"%lf",&hist_tmp); | |
582 IRM["CA"].push_back(hist_tmp); | |
583 } | |
584 token = strtok(NULL,"\t"); | |
585 if (token != NULL) { | |
586 double hist_tmp; | |
587 sscanf(token,"%lf",&hist_tmp); | |
588 IRM["AD"].push_back(hist_tmp); | |
589 } | |
590 token = strtok(NULL,"\t"); | |
591 if (token != NULL) { | |
592 double hist_tmp; | |
593 sscanf(token,"%lf",&hist_tmp); | |
594 IRM["AA"].push_back(hist_tmp); | |
595 } | |
596 /* token = strtok(NULL,"\t"); | |
597 if (token != NULL) { | |
598 double hist_tmp; | |
599 sscanf(token,"%lf",&hist_tmp); | |
600 IRM["AI"].push_back(hist_tmp); | |
601 }*/ | |
602 token = strtok(NULL,"\t"); | |
603 if (token != NULL) { | |
604 double hist_tmp; | |
605 sscanf(token,"%lf",&hist_tmp); | |
606 IRM["IR"].push_back(hist_tmp); | |
607 } | |
608 token = strtok(NULL,"\t"); | |
609 if (token != NULL) { | |
610 double hist_tmp; | |
611 sscanf(token,"%lf",&hist_tmp); | |
612 IRM["CS"].push_back(hist_tmp); | |
613 } | |
614 | |
615 } | |
616 IRM_file.close(); | |
617 } | |
618 | |
619 //read FZM | |
620 cout<<"Reading fragment size distn. file..."<<endl; | |
621 ifstream FZM_file(FZM_file_name); | |
622 vector<double> FZM; | |
623 char FZM_file_line[MAX_LINE_LEN]; | |
624 FZM_file.getline(FZM_file_line,MAX_LINE_LEN); | |
625 while(FZM_file.getline (FZM_file_line,MAX_LINE_LEN) ) | |
626 { | |
627 double hist_tmp; | |
628 sscanf(FZM_file_line, "%lf", &hist_tmp) ; | |
629 FZM.push_back(hist_tmp); | |
630 } | |
631 FZM_file.close(); | |
632 //start to read reads | |
633 cout<<"Reading reads from both ends...."<<endl; | |
634 ifstream read1_nomt_file(read1_nomt_file_name); | |
635 ifstream read2_nomt_file(read2_nomt_file_name); | |
636 char read1_line[MAX_LINE_LEN]; | |
637 char read2_line[MAX_LINE_LEN]; | |
638 int total_single_read_num=0; | |
639 int total_pair_read_num=0; | |
640 int total_read_num=0; | |
641 while(read1_nomt_file.getline(read1_line, MAX_LINE_LEN)) | |
642 { | |
643 total_read_num++; | |
644 ReadInfo read1(read1_line); | |
645 map<string, vector<int> > read1_mappos; | |
646 if(read1.isUseful) | |
647 { | |
648 read1.scan(read1_mappos); | |
649 } | |
650 read2_nomt_file.getline(read2_line, MAX_LINE_LEN); | |
651 ReadInfo read2(read2_line); | |
652 map<string, vector<int> > read2_mappos; | |
653 if(read2.isUseful) | |
654 { | |
655 read2.scan(read2_mappos); | |
656 } | |
657 //pairing | |
658 ReadPair read_pair_ins (read1_mappos, read2_mappos, read_size); | |
659 //feed them into TXdb_entries | |
660 map<string , ReadPairInfo>::iterator it = read_pair_ins.mappinginfo.begin(); | |
661 for(; it!=read_pair_ins.mappinginfo.end();it++) | |
662 { | |
663 string eventid = it->first; | |
664 map<string, TXdb_entry>::iterator txdb_it = TXdb_entries.find(eventid); | |
665 if(txdb_it == TXdb_entries.end() ) | |
666 { | |
667 continue; | |
668 } | |
669 //get numbers | |
670 for(int i=0; i<4;i++) | |
671 { | |
672 if(read_pair_ins.mappinginfo[eventid].Pos[i]!=-1) | |
673 { | |
674 int pos_tmp1 = read_pair_ins.mappinginfo[eventid].Pos[i]; | |
675 int pos_tmp2; | |
676 if(i<2) | |
677 { | |
678 if (txdb_it->second.strand == '-') | |
679 pos_tmp1 = txdb_it->second.len1 -pos_tmp1 +1- read_size; | |
680 pos_tmp2 = pos_tmp1 + read_size; | |
681 if (pos_tmp2 <= txdb_it->second.exonsize[0]) | |
682 { txdb_it->second.enum1++;continue;} | |
683 if (pos_tmp1 <= txdb_it->second.exonsize[0] ) | |
684 { | |
685 if(read_pair_ins.mappinginfo[eventid].IsoFlag[1] !=1) | |
686 { | |
687 txdb_it->second.jnum12++; | |
688 } | |
689 else | |
690 { | |
691 txdb_it->second.enum1++; | |
692 } | |
693 continue; | |
694 } | |
695 if (pos_tmp2 <= txdb_it->second.exonsize[0]+txdb_it->second.exonsize[1]) | |
696 { txdb_it->second.enum2++;continue;} | |
697 if (pos_tmp1 <= txdb_it->second.exonsize[0]+txdb_it->second.exonsize[1]) | |
698 { | |
699 if(read_pair_ins.mappinginfo[eventid].IsoFlag[1] !=1) | |
700 { | |
701 txdb_it->second.jnum23++; | |
702 } | |
703 else | |
704 { | |
705 txdb_it->second.enum2++; | |
706 } | |
707 continue; | |
708 | |
709 } | |
710 txdb_it->second.enum3++; | |
711 } | |
712 else | |
713 { | |
714 if (read_pair_ins.mappinginfo[eventid].IsoFlag[0] == 1 | |
715 && read_pair_ins.mappinginfo[eventid].IsoFlag[1] == 1) | |
716 continue; | |
717 if (txdb_it->second.strand == '-') | |
718 pos_tmp1 = txdb_it->second.len2 -pos_tmp1 +1- read_size; | |
719 pos_tmp2 = pos_tmp1 + read_size; | |
720 if (pos_tmp2 <= txdb_it->second.exonsize[0]) | |
721 { txdb_it->second.enum1++;continue;} | |
722 if (pos_tmp1 <= txdb_it->second.exonsize[0] ) | |
723 { | |
724 if(read_pair_ins.mappinginfo[eventid].IsoFlag[0] !=1) | |
725 { | |
726 txdb_it->second.jnum13++; | |
727 } | |
728 else | |
729 { | |
730 txdb_it->second.enum1++; | |
731 } | |
732 continue; | |
733 } | |
734 txdb_it->second.enum3++; | |
735 | |
736 | |
737 } | |
738 } | |
739 } | |
740 | |
741 if(read_pair_ins.mappinginfo[eventid].isPaired) | |
742 { | |
743 total_pair_read_num += 2; | |
744 txdb_it->second.totalreadnum += 2; | |
745 (txdb_it->second).pairs.push_back(read_pair_ins.mappinginfo[eventid]); | |
746 } | |
747 else | |
748 { | |
749 total_single_read_num += 2; | |
750 txdb_it->second.totalreadnum += 1; | |
751 if(read_pair_ins.mappinginfo[eventid].IsoFlag[0] == 1) | |
752 { | |
753 if(read_pair_ins.mappinginfo[eventid].IsoFlag[1] == 0) | |
754 { | |
755 (txdb_it->second).snum1++; | |
756 } | |
757 else | |
758 { | |
759 (txdb_it->second).snum12++; | |
760 } | |
761 } | |
762 else | |
763 { | |
764 (txdb_it->second).snum2++; | |
765 } | |
766 } | |
767 } | |
768 | |
769 | |
770 } | |
771 | |
772 read1_nomt_file.close(); | |
773 read2_nomt_file.close(); | |
774 cout<<"Total read number:\t"<<total_read_num*2<<endl; | |
775 cout<<"Singled reads used:\t"<<total_single_read_num<<endl; | |
776 cout<<"Paired reads used:\t"<<total_pair_read_num<<endl; | |
777 | |
778 log_file<<"#total_read_num:\t"<<total_read_num*2<<endl; | |
779 log_file<<"#total_single_read_num:\t"<<total_single_read_num<<endl; | |
780 log_file<<"#total_pair_read_num:\t"<<total_pair_read_num<<endl; | |
781 | |
782 //Calculate ratios and output | |
783 cout<<"Now doing estimations and output to "<<ratio_file_name<<"\t"<<num_file_name<<endl; | |
784 ofstream ratio_file(ratio_file_name.c_str()); | |
785 ofstream num_file(num_file_name.c_str()); | |
786 map<string, TXdb_entry>::iterator txdb_it = TXdb_entries.begin(); | |
787 for(;txdb_it!=TXdb_entries.end();txdb_it++) | |
788 { | |
789 string eventid = txdb_it->first; | |
790 //cout<< eventid <<endl; | |
791 txdb_it->second.estimate(FZM,IRM,read_size); | |
792 /* ratio_file<< eventid<<"\t" | |
793 <<setprecision(4)<<fixed | |
794 <<txdb_it->second.ir<<"\t" | |
795 <<txdb_it->second.bir<<"\t" | |
796 <<"NA\tNA\tNA\t" | |
797 <<txdb_it->second.jnum12<<"\t" | |
798 <<txdb_it->second.jnum23<<"\t" | |
799 <<txdb_it->second.jnum13<<"\t" | |
800 <<(1.0*read_size*( txdb_it->second.enum1+txdb_it->second.jnum12 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[0] )<<"\t" | |
801 <<(1.0*read_size*( txdb_it->second.enum2+txdb_it->second.jnum12 +txdb_it->second.jnum23 )/txdb_it->second.exonsize[1] )<<"\t" | |
802 <<(1.0*read_size*( txdb_it->second.enum3+txdb_it->second.jnum23 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[2] )<<"\t" | |
803 <<txdb_it->second.exonsize[0]<<"\t" | |
804 <<txdb_it->second.exonsize[1]<<"\t" | |
805 <<txdb_it->second.exonsize[2]<<"\t" | |
806 <<(1.0*read_size*txdb_it->second.totalreadnum/txdb_it->second.len1)<<endl; | |
807 */ | |
808 ratio_file<<eventid<<"\t" | |
809 <<setprecision(4)<<fixed | |
810 <<txdb_it->second.ir<<"\t" //1 | |
811 <<txdb_it->second.bir<<"\t" //2 | |
812 <<txdb_it->second.chrid<<"\t" //3 | |
813 <<txdb_it->second.start<<"," // | |
814 <<txdb_it->second.start+txdb_it->second.exonsize[0]<<"\t" //4 | |
815 <<txdb_it->second.start+txdb_it->second.exonstarts[1]<<"," | |
816 <<txdb_it->second.start+txdb_it->second.exonstarts[1]+txdb_it->second.exonsize[1]<<"\t"//5 | |
817 <<txdb_it->second.start+txdb_it->second.exonstarts[2]<<"," | |
818 <<txdb_it->second.start+txdb_it->second.exonstarts[2]+txdb_it->second.exonsize[2]<<"\t"//6 | |
819 <<txdb_it->second.strand<<"\t"//7 | |
820 <<txdb_it->second.jnum12<<"\t"//8 | |
821 <<txdb_it->second.jnum23<<"\t"//9 | |
822 <<txdb_it->second.jnum13<<"\t"//10 | |
823 <<(1.0*read_size*( txdb_it->second.enum1+txdb_it->second.jnum12 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[0] )<<"\t"//11 | |
824 <<(1.0*read_size*( txdb_it->second.enum2+txdb_it->second.jnum12 +txdb_it->second.jnum23 )/txdb_it->second.exonsize[1] )<<"\t"//12 | |
825 <<(1.0*read_size*( txdb_it->second.enum3+txdb_it->second.jnum23 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[2] )<<"\t"//13` | |
826 <<(1.0*read_size*txdb_it->second.totalreadnum/txdb_it->second.len1)<<"\t"//14 | |
827 <<txdb_it->second.exonsize[0]<<"\t"//15 | |
828 <<txdb_it->second.exonsize[1]<<"\t"//16 | |
829 <<txdb_it->second.exonsize[2]<<endl;//17 | |
830 | |
831 num_file<< eventid<<"\t" | |
832 <<txdb_it->second.chrid<<"\t" | |
833 <<txdb_it->second.start<<"," | |
834 <<txdb_it->second.start+txdb_it->second.exonsize[0]<<"\t" | |
835 <<txdb_it->second.start+txdb_it->second.exonstarts[1]<<"," | |
836 <<txdb_it->second.start+txdb_it->second.exonstarts[1]+txdb_it->second.exonsize[1]<<"\t" | |
837 <<txdb_it->second.start+txdb_it->second.exonstarts[2]<<"," | |
838 <<txdb_it->second.start+txdb_it->second.exonstarts[2]+txdb_it->second.exonsize[2]<<"\t" | |
839 <<txdb_it->second.strand<<"\t" | |
840 <<txdb_it->second.enum1<<"\t" | |
841 <<txdb_it->second.enum2<<"\t" | |
842 <<txdb_it->second.enum3<<"\t" | |
843 <<txdb_it->second.jnum12<<"\t" | |
844 <<txdb_it->second.jnum23<<"\t" | |
845 <<txdb_it->second.jnum13<<endl; | |
846 } | |
847 log_file.close(); | |
848 ratio_file.close(); | |
849 num_file.close(); | |
850 clock_finish = clock(); | |
851 cout<<"Done! time used:"<<(double)(clock_finish-clock_start)/CLOCKS_PER_SEC<<" seconds"<<endl; | |
852 | |
853 | |
854 } |