Mercurial > repos > bioitcore > splicetrap
comparison src/splicetrap.estimate.cpp @ 1:adc0f7765d85 draft
planemo upload
| author | bioitcore |
|---|---|
| date | Thu, 07 Sep 2017 15:06:58 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:d4ca551ca300 | 1:adc0f7765d85 |
|---|---|
| 1 //Author: Jie Wu@CSHL | |
| 2 //TO replace the original Perl code | |
| 3 | |
| 4 #include <stdio.h> | |
| 5 #include <stdlib.h> | |
| 6 #include <iostream> | |
| 7 #include <fstream> | |
| 8 #include <string.h> | |
| 9 #include <unistd.h> | |
| 10 #include <sstream> | |
| 11 #include <map> | |
| 12 #include <vector> | |
| 13 #include <math.h> | |
| 14 #include <time.h> | |
| 15 #include <limits> | |
| 16 #include <iomanip> | |
| 17 | |
| 18 using namespace std; | |
| 19 | |
| 20 int MAX_LINE_LEN = 1024*1024; | |
| 21 | |
| 22 void printusage() | |
| 23 { | |
| 24 cout<< "\tUsage:"<<endl; | |
| 25 cout<<"\t-s\tread_size"<<endl; | |
| 26 cout<<"\t-b\tIRM file"<<endl; | |
| 27 cout<<"\t-f\tFZM file"<<endl; | |
| 28 cout<<"\t-1\tmapping result 1"<<endl; | |
| 29 cout<<"\t-2\tmapping result 2"<<endl; | |
| 30 cout<<"\t-d\tTXdb database file"<<endl; | |
| 31 cout<<"\t-o\toutput prefix"<<endl; | |
| 32 | |
| 33 } | |
| 34 | |
| 35 struct ReadPairInfo | |
| 36 { | |
| 37 bool isPaired; | |
| 38 vector<int> IsoFlag; //flag as 1 if mapped to the isoform | |
| 39 vector<int> Size; //fragment size in the isoform | |
| 40 vector<int> Pos; | |
| 41 }; | |
| 42 | |
| 43 class TXdb_entry{ | |
| 44 public: | |
| 45 string chrid; | |
| 46 //int start; | |
| 47 //int end; | |
| 48 string id; | |
| 49 bool iscomplete; | |
| 50 int start; | |
| 51 int end; | |
| 52 int exonstarts[3]; | |
| 53 char strand; | |
| 54 int exonsize[3]; | |
| 55 int len1; | |
| 56 int len2; | |
| 57 vector< ReadPairInfo > pairs; | |
| 58 int snum1; //single-end reads | |
| 59 int snum2; | |
| 60 int snum12; | |
| 61 float ir, bir; | |
| 62 int enum1; //exon body reads | |
| 63 int enum2; | |
| 64 int enum3; | |
| 65 | |
| 66 int jnum12; | |
| 67 int jnum23; | |
| 68 int jnum13; | |
| 69 | |
| 70 int totalreadnum; | |
| 71 void estimate(vector<double> &FZM, map<string, vector<double> > &IRM, int &read_size) | |
| 72 { | |
| 73 float Maxe=0; | |
| 74 float Max = -numeric_limits<float>::max(); | |
| 75 float BMaxe=0; | |
| 76 float BMax=-numeric_limits<float>::max(); | |
| 77 for(float e1=0.001; e1<1;e1=e1+0.001) | |
| 78 { | |
| 79 float efix1 = e1*len1/ | |
| 80 (e1*exonsize[1]+len2); | |
| 81 float efix2 = 1-efix1; | |
| 82 float LL = snum12*log( efix1/(len1-read_size+1) | |
| 83 + efix2/(len2-read_size+1) ) | |
| 84 + snum1*log(efix1/(len1-read_size+1)) | |
| 85 + snum2*log(efix2/(len2-read_size+1)); | |
| 86 //cout<<LL<<endl; | |
| 87 //cout<<efix1<<"\t" <<pairs.size()<<"\t" <<len2<<"\t" <<endl; | |
| 88 //int num1=0, num2=0,num12=0; | |
| 89 for(int i=0;i<pairs.size();i++) | |
| 90 { | |
| 91 if(pairs[i].IsoFlag[0] == 1) | |
| 92 { | |
| 93 if(pairs[i].IsoFlag[1] == 0) | |
| 94 { | |
| 95 // num1++; | |
| 96 float tmp=FZM[pairs[i].Size[0]]*efix1/(len1-pairs[i].Size[0]+1); | |
| 97 if (tmp == 0) | |
| 98 { | |
| 99 LL = LL - 308; | |
| 100 } | |
| 101 else | |
| 102 { | |
| 103 LL = LL + log(tmp); | |
| 104 } | |
| 105 } | |
| 106 else | |
| 107 { | |
| 108 // num12++; | |
| 109 float tmp=(FZM[pairs[i].Size[0]]*efix1/(len1-pairs[i].Size[0]+1)) | |
| 110 + (FZM[pairs[i].Size[1]]*efix2/(len2-pairs[i].Size[1]+1)); | |
| 111 if(tmp ==0) | |
| 112 { | |
| 113 LL = LL -308; | |
| 114 } | |
| 115 else | |
| 116 { | |
| 117 LL = LL + log(tmp); | |
| 118 } | |
| 119 } | |
| 120 } | |
| 121 else | |
| 122 { | |
| 123 //num2++; | |
| 124 float tmp = FZM[pairs[i].Size[1]]*efix2/(len2-pairs[i].Size[1]+1); | |
| 125 if(tmp ==0) | |
| 126 { | |
| 127 LL= LL - 308; | |
| 128 } | |
| 129 else | |
| 130 { | |
| 131 LL = LL +log(tmp); | |
| 132 } | |
| 133 } | |
| 134 } | |
| 135 //cout<<num1<<"\t"<<num2<<"\t"<<num12<<"\n"; | |
| 136 //cout<<LL<<endl; | |
| 137 | |
| 138 if(IRM["CA"].size() > 0) | |
| 139 { | |
| 140 float BLL; | |
| 141 string eventtype =id.substr(0,2); | |
| 142 if(!eventtype.compare("ME")) | |
| 143 { | |
| 144 eventtype = "CA"; | |
| 145 } | |
| 146 BLL = LL + log(IRM[eventtype][int(e1/0.001)]); | |
| 147 if(BLL > BMax) | |
| 148 { | |
| 149 BMax = BLL; | |
| 150 BMaxe = e1; | |
| 151 } | |
| 152 } | |
| 153 //cout<<LL<<endl; | |
| 154 if(LL > Max) | |
| 155 { | |
| 156 Max= LL; | |
| 157 Maxe = e1; | |
| 158 | |
| 159 } | |
| 160 | |
| 161 } | |
| 162 ir= Maxe; | |
| 163 bir= BMaxe; | |
| 164 } | |
| 165 | |
| 166 TXdb_entry(char *line) | |
| 167 { | |
| 168 size_t found; | |
| 169 string linestr(line); | |
| 170 found = linestr.find("[L]"); | |
| 171 if(found != string::npos) | |
| 172 { | |
| 173 iscomplete=1; | |
| 174 | |
| 175 char *token; | |
| 176 | |
| 177 token = strtok(line,"\t"); | |
| 178 if(token==NULL) | |
| 179 { | |
| 180 cout<<"error1"<<endl; | |
| 181 exit(0); | |
| 182 } | |
| 183 else | |
| 184 { | |
| 185 chrid = token; | |
| 186 } | |
| 187 token = strtok(NULL,"\t"); | |
| 188 if(token==NULL) | |
| 189 { | |
| 190 cout<<"error2"<<endl; | |
| 191 exit(0); | |
| 192 } | |
| 193 else | |
| 194 { | |
| 195 sscanf(token,"%d",&start); | |
| 196 } | |
| 197 token = strtok(NULL,"\t"); | |
| 198 if(token==NULL) | |
| 199 { | |
| 200 cout<<"error3"<<endl; | |
| 201 exit(0); | |
| 202 } | |
| 203 else | |
| 204 { | |
| 205 sscanf(token,"%d",&end); | |
| 206 } | |
| 207 token = strtok(NULL,"\t"); | |
| 208 if(token==NULL) | |
| 209 { | |
| 210 cout<<"error4"<<endl;exit(0); | |
| 211 } | |
| 212 else | |
| 213 { | |
| 214 id = token; | |
| 215 id = id.substr(0,id.size()-3); | |
| 216 } | |
| 217 token = strtok(NULL,"\t"); | |
| 218 token = strtok(NULL,"\t"); | |
| 219 if(token==NULL) | |
| 220 { | |
| 221 cout<<"error5"<<endl;exit(0); | |
| 222 } | |
| 223 else | |
| 224 { | |
| 225 sscanf(token,"%c",&strand); | |
| 226 } | |
| 227 for(int i=0;i<5;i++) | |
| 228 { | |
| 229 token=strtok(NULL,"\t"); | |
| 230 } | |
| 231 if(token==NULL){ | |
| 232 cout<<"error6"<<endl;exit(0); | |
| 233 } | |
| 234 else | |
| 235 { | |
| 236 string str_tmp(token); | |
| 237 size_t found_tmp; | |
| 238 int *pexonsize = exonsize; | |
| 239 while((found_tmp = str_tmp.find(",")) != string::npos){ | |
| 240 sscanf((str_tmp.substr(0,found_tmp)).c_str(),"%d",pexonsize); | |
| 241 pexonsize++; | |
| 242 str_tmp.erase(0,found_tmp+1); | |
| 243 } | |
| 244 sscanf(str_tmp.c_str(),"%d",pexonsize); | |
| 245 | |
| 246 } | |
| 247 token=strtok(NULL,"\t"); | |
| 248 | |
| 249 if(token==NULL){ | |
| 250 cout<<"error7"<<endl;exit(0); | |
| 251 } | |
| 252 else | |
| 253 { | |
| 254 string str_tmp(token); | |
| 255 size_t found_tmp; | |
| 256 int *pexonstarts = exonstarts; | |
| 257 while((found_tmp = str_tmp.find(",")) != string::npos){ | |
| 258 sscanf((str_tmp.substr(0,found_tmp)).c_str(),"%d",pexonstarts); | |
| 259 pexonstarts++; | |
| 260 str_tmp.erase(0,found_tmp+1); | |
| 261 } | |
| 262 sscanf(str_tmp.c_str(),"%d",pexonstarts); | |
| 263 | |
| 264 } | |
| 265 len1 = exonsize[0]+exonsize[1]+exonsize[2]; | |
| 266 len2 = exonsize[0]+exonsize[2]; | |
| 267 snum12=0; | |
| 268 snum1 =0; | |
| 269 snum2 =0; | |
| 270 enum1=0; | |
| 271 enum2=0; | |
| 272 enum3=0; | |
| 273 jnum12=0; | |
| 274 jnum13=0; | |
| 275 jnum23=0; | |
| 276 totalreadnum=0; | |
| 277 } | |
| 278 else | |
| 279 { | |
| 280 iscomplete = 0; | |
| 281 } | |
| 282 } | |
| 283 }; | |
| 284 | |
| 285 class ReadInfo | |
| 286 { | |
| 287 public: | |
| 288 bool isUseful; | |
| 289 char *pline; | |
| 290 void scan(map<string, vector<int> > &map_pos ) | |
| 291 //eventname, pos in L/S iso | |
| 292 { | |
| 293 char *token; | |
| 294 token = strtok(pline, "\t"); | |
| 295 token = strtok(NULL, "\t"); | |
| 296 token = strtok(NULL, "\t"); | |
| 297 token = strtok(NULL, "\t"); | |
| 298 string mapinfo(token); | |
| 299 size_t beg, end; | |
| 300 end = 0; | |
| 301 while( (beg= mapinfo.find_first_of("/", end))!=string::npos) | |
| 302 { | |
| 303 beg ++; | |
| 304 end = mapinfo.find_first_of("[",beg); | |
| 305 string eventid = mapinfo.substr( beg, end-beg ) ; | |
| 306 vector<int> pos(2,-1); | |
| 307 if(!map_pos.count(eventid)) | |
| 308 { | |
| 309 map_pos.insert(pair<string, vector<int> > | |
| 310 (eventid, pos) ); | |
| 311 } | |
| 312 end ++; | |
| 313 string LorS = mapinfo.substr( end,1 ); | |
| 314 beg = mapinfo.find_first_of(":",end ); | |
| 315 beg ++; | |
| 316 end = mapinfo.find_first_of(",",beg ); | |
| 317 string pos_str(mapinfo.substr(beg, end-2-beg)); | |
| 318 | |
| 319 if(LorS.compare("L") == 0) | |
| 320 { | |
| 321 sscanf(pos_str.c_str(),"%d",&(map_pos[eventid][0]) ); | |
| 322 } | |
| 323 else | |
| 324 { | |
| 325 sscanf(pos_str.c_str(),"%d",&(map_pos[eventid][1]) ); | |
| 326 } | |
| 327 | |
| 328 //cout<<eventid<<"\t"<<map_pos[eventid][0]<<"\t"<<map_pos[eventid][1]<<endl; | |
| 329 } | |
| 330 | |
| 331 } | |
| 332 ReadInfo(char *line) | |
| 333 { | |
| 334 string line_str(line); | |
| 335 string line_str_last2=line_str.substr(line_str.size()-2,2); | |
| 336 if(line_str_last2.compare("NM") == 0 ) | |
| 337 { | |
| 338 isUseful = 0; | |
| 339 } | |
| 340 else if(line_str_last2.compare("MT") == 0 ) | |
| 341 { | |
| 342 isUseful = 0; | |
| 343 } | |
| 344 else | |
| 345 { | |
| 346 isUseful = 1; | |
| 347 pline = line; | |
| 348 } | |
| 349 } | |
| 350 }; | |
| 351 | |
| 352 | |
| 353 | |
| 354 class ReadPair | |
| 355 { | |
| 356 public: | |
| 357 map<string, ReadPairInfo> mappinginfo; | |
| 358 ReadPair(map<string, vector<int> > &read1_mappos, map<string, vector<int> > &read2_mappos, int &read_size) | |
| 359 { | |
| 360 map<string, vector<int> >::iterator it1 = read1_mappos.begin(); | |
| 361 for(;it1!=read1_mappos.end();it1++) | |
| 362 { | |
| 363 ReadPairInfo ReadPairInfo_tmp; | |
| 364 ReadPairInfo_tmp.IsoFlag.resize(2); | |
| 365 ReadPairInfo_tmp.Size.resize(2); | |
| 366 //ReadPairInfo_tmp.Pos.resize(4); | |
| 367 ReadPairInfo_tmp.Pos.assign(4,-1); | |
| 368 string eventid = it1->first; | |
| 369 if(read2_mappos.count(eventid) > 0) | |
| 370 { | |
| 371 ReadPairInfo_tmp.isPaired = 0; | |
| 372 for(int i=0;i<2;i++) | |
| 373 { | |
| 374 //cout<<read1_mappos[eventid][i]<<endl; | |
| 375 if(read2_mappos[eventid][i] == -1 || read1_mappos[eventid][i] == -1) | |
| 376 { | |
| 377 ReadPairInfo_tmp.IsoFlag[i] = 0; | |
| 378 ReadPairInfo_tmp.Size[i] = 0; | |
| 379 //discard reads like this | |
| 380 } | |
| 381 else | |
| 382 { | |
| 383 | |
| 384 ReadPairInfo_tmp.isPaired = 1; | |
| 385 ReadPairInfo_tmp.IsoFlag[i] = 1; | |
| 386 ReadPairInfo_tmp.Size[i] = abs(read2_mappos[eventid][i] | |
| 387 -read1_mappos[eventid][i]) | |
| 388 +read_size; | |
| 389 ReadPairInfo_tmp.Pos[2*i] = read1_mappos[eventid][i]; | |
| 390 ReadPairInfo_tmp.Pos[2*i+1] = read2_mappos[eventid][i]; | |
| 391 } | |
| 392 } | |
| 393 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp)); | |
| 394 } | |
| 395 else | |
| 396 { | |
| 397 ReadPairInfo_tmp.isPaired = 0; | |
| 398 for(int i=0;i<2;i++) | |
| 399 { | |
| 400 ReadPairInfo_tmp.Pos[2*i] = read1_mappos[eventid][i]; | |
| 401 ReadPairInfo_tmp.Size[i] = 0; | |
| 402 if(read1_mappos[eventid][i] == -1) | |
| 403 { | |
| 404 ReadPairInfo_tmp.IsoFlag[i] = 0; | |
| 405 | |
| 406 } | |
| 407 else | |
| 408 { | |
| 409 ReadPairInfo_tmp.IsoFlag[i] = 1; | |
| 410 } | |
| 411 } | |
| 412 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp)); | |
| 413 } | |
| 414 } | |
| 415 map<string, vector<int> >::iterator it2 = read2_mappos.begin(); | |
| 416 for(;it2!=read2_mappos.end();it2++) | |
| 417 { | |
| 418 string eventid = it2->first; | |
| 419 if(mappinginfo.count(eventid)>0) | |
| 420 { | |
| 421 continue; | |
| 422 } | |
| 423 ReadPairInfo ReadPairInfo_tmp; | |
| 424 ReadPairInfo_tmp.IsoFlag.resize(2); | |
| 425 ReadPairInfo_tmp.Size.resize(2); | |
| 426 //ReadPairInfo_tmp.Pos.resize(4); | |
| 427 ReadPairInfo_tmp.Pos.assign(4,-1); | |
| 428 if(read1_mappos.count(eventid) > 0) | |
| 429 { | |
| 430 ReadPairInfo_tmp.isPaired = 0; | |
| 431 for(int i=0;i<2;i++) | |
| 432 { | |
| 433 if(read2_mappos[eventid][i] == -1 || read1_mappos[eventid][i] == -1) | |
| 434 { | |
| 435 ReadPairInfo_tmp.IsoFlag[i] = 0; | |
| 436 ReadPairInfo_tmp.Size[i] = 0; | |
| 437 } | |
| 438 else | |
| 439 { | |
| 440 ReadPairInfo_tmp.isPaired = 1; | |
| 441 ReadPairInfo_tmp.IsoFlag[i] = 1; | |
| 442 ReadPairInfo_tmp.Size[i] = abs(read2_mappos[eventid][i] | |
| 443 -read1_mappos[eventid][i] | |
| 444 +read_size); | |
| 445 ReadPairInfo_tmp.Pos[2*i] = read1_mappos[eventid][i]; | |
| 446 ReadPairInfo_tmp.Pos[2*i+1]=read2_mappos[eventid][i]; | |
| 447 } | |
| 448 } | |
| 449 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp)); | |
| 450 } | |
| 451 else | |
| 452 { | |
| 453 ReadPairInfo_tmp.isPaired = 0; | |
| 454 for(int i=0;i<2;i++) | |
| 455 { | |
| 456 ReadPairInfo_tmp.Pos[2*i+1]=read2_mappos[eventid][i]; | |
| 457 ReadPairInfo_tmp.Size[i] = 0; | |
| 458 if(read2_mappos[eventid][i] == -1) | |
| 459 { | |
| 460 ReadPairInfo_tmp.IsoFlag[i] = 0; | |
| 461 } | |
| 462 else | |
| 463 { | |
| 464 ReadPairInfo_tmp.IsoFlag[i] = 1; | |
| 465 } | |
| 466 } | |
| 467 mappinginfo.insert (pair<string, ReadPairInfo>(eventid, ReadPairInfo_tmp)); | |
| 468 } | |
| 469 | |
| 470 } | |
| 471 } | |
| 472 }; | |
| 473 | |
| 474 int main (int argc, char **argv) { | |
| 475 int next_option; | |
| 476 int digit_optind = 0; | |
| 477 int read_size = 36; | |
| 478 char *dbfile_name = 0, | |
| 479 *IRM_file_name = 0, | |
| 480 *FZM_file_name = 0, | |
| 481 *output_file_prefix = 0, | |
| 482 *read1_nomt_file_name = 0, | |
| 483 *read2_nomt_file_name = 0; | |
| 484 | |
| 485 while ( (next_option = getopt(argc, argv, "s:d:b:f:o:1:2:")) != -1) { | |
| 486 int this_option_optind = optind ? optind : 1; | |
| 487 switch (next_option) { | |
| 488 case 's': sscanf (optarg,"%d",&read_size); break; | |
| 489 case 'd': dbfile_name = optarg; break; | |
| 490 case 'b': IRM_file_name = optarg; break; | |
| 491 case 'f': FZM_file_name = optarg; break; | |
| 492 case 'o': output_file_prefix = optarg; break; | |
| 493 case '1': read1_nomt_file_name = optarg; break; | |
| 494 case '2': read2_nomt_file_name = optarg; break; | |
| 495 default: break; | |
| 496 } | |
| 497 } | |
| 498 if (!read_size|| | |
| 499 !dbfile_name|| | |
| 500 !FZM_file_name|| | |
| 501 !output_file_prefix|| | |
| 502 !read1_nomt_file_name|| | |
| 503 !read2_nomt_file_name) | |
| 504 { | |
| 505 printusage(); | |
| 506 return 0; | |
| 507 } | |
| 508 clock_t clock_start, clock_finish; | |
| 509 clock_start=clock(); | |
| 510 string ratio_file_name = output_file_prefix; | |
| 511 ratio_file_name.append(".ratio"); | |
| 512 string log_file_name = output_file_prefix; | |
| 513 log_file_name.append(".log"); | |
| 514 string num_file_name = output_file_prefix; | |
| 515 num_file_name.append(".nums"); | |
| 516 | |
| 517 ofstream log_file(log_file_name.c_str()); | |
| 518 cout<<"Log file name:\t"<<log_file_name<<endl; | |
| 519 cout<<"Parameters:"<<endl; | |
| 520 cout<<"Read size:\t"<<read_size<<endl; | |
| 521 log_file<<"#read_size:\t"<<read_size<<endl; | |
| 522 cout<<"Database:\t"<<dbfile_name<<endl; | |
| 523 log_file<<"#dbfile_name:\t"<<dbfile_name<<endl; | |
| 524 cout<<"Fragment size file name:\t"<<FZM_file_name<<endl; | |
| 525 log_file<<"#FZM_file_name:\t"<<FZM_file_name<<endl; | |
| 526 cout<<"Output file prefix:\t"<<output_file_prefix<<endl; | |
| 527 cout<<"Read end1 file:\t"<<read1_nomt_file_name<<endl; | |
| 528 cout<<"Read end2 file:\t"<<read2_nomt_file_name<<endl; | |
| 529 log_file<<"#read1_nomt_file_name:\t"<<read1_nomt_file_name<<endl; | |
| 530 log_file<<"#read2_nomt_file_name:\t"<<read2_nomt_file_name<<endl; | |
| 531 if(IRM_file_name) | |
| 532 { | |
| 533 cout<<"Inclusion ratio distn. file:\t"<<IRM_file_name<<endl; | |
| 534 log_file<<"#IRM_file_name:\t"<<IRM_file_name<<endl; | |
| 535 | |
| 536 } | |
| 537 else | |
| 538 { | |
| 539 cout<<"Inclusion ratio distn. file:\tNA"<<endl; | |
| 540 log_file<<"#IRM_file_name:\tNA"<<endl; | |
| 541 } | |
| 542 | |
| 543 //start reading files, dbfile first | |
| 544 cout<<"Reading database file..."<<endl; | |
| 545 map<string, TXdb_entry> TXdb_entries; | |
| 546 ifstream dbfile(dbfile_name); | |
| 547 char dbfile_line[MAX_LINE_LEN]; | |
| 548 while(dbfile.getline(dbfile_line,MAX_LINE_LEN)){ | |
| 549 TXdb_entry entry_tmp(dbfile_line); | |
| 550 if(entry_tmp.iscomplete) | |
| 551 { | |
| 552 TXdb_entries.insert(pair<string, TXdb_entry> | |
| 553 (entry_tmp.id,entry_tmp) | |
| 554 ); | |
| 555 //cout<<entry_tmp.exonsize[1]<<endl; | |
| 556 } | |
| 557 | |
| 558 } | |
| 559 dbfile.close(); | |
| 560 | |
| 561 //start reading bayes prior if there is | |
| 562 map<string, vector< double > > IRM; | |
| 563 IRM.insert(pair<string, vector<double> > ("CA", vector<double>())); | |
| 564 IRM.insert(pair<string, vector<double> > ("CS", vector<double>())); | |
| 565 IRM.insert(pair<string, vector<double> > ("IR", vector<double>())); | |
| 566 IRM.insert(pair<string, vector<double> > ("AA", vector<double>())); | |
| 567 IRM.insert(pair<string, vector<double> > ("AD", vector<double>())); | |
| 568 //IRM.insert(pair<string, vector<double> > ("AI", vector<double>())); | |
| 569 if(IRM_file_name) | |
| 570 { | |
| 571 cout<<"Reading inclusion ratio distn. file..."<<endl; | |
| 572 ifstream IRM_file(IRM_file_name); | |
| 573 char IRM_line[MAX_LINE_LEN]; | |
| 574 IRM_file.getline(IRM_line, MAX_LINE_LEN); | |
| 575 while(IRM_file.getline(IRM_line, MAX_LINE_LEN)) | |
| 576 { | |
| 577 char *token; | |
| 578 token = strtok(IRM_line, "\t"); | |
| 579 if (token != NULL) { | |
| 580 double hist_tmp; | |
| 581 sscanf(token,"%lf",&hist_tmp); | |
| 582 IRM["CA"].push_back(hist_tmp); | |
| 583 } | |
| 584 token = strtok(NULL,"\t"); | |
| 585 if (token != NULL) { | |
| 586 double hist_tmp; | |
| 587 sscanf(token,"%lf",&hist_tmp); | |
| 588 IRM["AD"].push_back(hist_tmp); | |
| 589 } | |
| 590 token = strtok(NULL,"\t"); | |
| 591 if (token != NULL) { | |
| 592 double hist_tmp; | |
| 593 sscanf(token,"%lf",&hist_tmp); | |
| 594 IRM["AA"].push_back(hist_tmp); | |
| 595 } | |
| 596 /* token = strtok(NULL,"\t"); | |
| 597 if (token != NULL) { | |
| 598 double hist_tmp; | |
| 599 sscanf(token,"%lf",&hist_tmp); | |
| 600 IRM["AI"].push_back(hist_tmp); | |
| 601 }*/ | |
| 602 token = strtok(NULL,"\t"); | |
| 603 if (token != NULL) { | |
| 604 double hist_tmp; | |
| 605 sscanf(token,"%lf",&hist_tmp); | |
| 606 IRM["IR"].push_back(hist_tmp); | |
| 607 } | |
| 608 token = strtok(NULL,"\t"); | |
| 609 if (token != NULL) { | |
| 610 double hist_tmp; | |
| 611 sscanf(token,"%lf",&hist_tmp); | |
| 612 IRM["CS"].push_back(hist_tmp); | |
| 613 } | |
| 614 | |
| 615 } | |
| 616 IRM_file.close(); | |
| 617 } | |
| 618 | |
| 619 //read FZM | |
| 620 cout<<"Reading fragment size distn. file..."<<endl; | |
| 621 ifstream FZM_file(FZM_file_name); | |
| 622 vector<double> FZM; | |
| 623 char FZM_file_line[MAX_LINE_LEN]; | |
| 624 FZM_file.getline(FZM_file_line,MAX_LINE_LEN); | |
| 625 while(FZM_file.getline (FZM_file_line,MAX_LINE_LEN) ) | |
| 626 { | |
| 627 double hist_tmp; | |
| 628 sscanf(FZM_file_line, "%lf", &hist_tmp) ; | |
| 629 FZM.push_back(hist_tmp); | |
| 630 } | |
| 631 FZM_file.close(); | |
| 632 //start to read reads | |
| 633 cout<<"Reading reads from both ends...."<<endl; | |
| 634 ifstream read1_nomt_file(read1_nomt_file_name); | |
| 635 ifstream read2_nomt_file(read2_nomt_file_name); | |
| 636 char read1_line[MAX_LINE_LEN]; | |
| 637 char read2_line[MAX_LINE_LEN]; | |
| 638 int total_single_read_num=0; | |
| 639 int total_pair_read_num=0; | |
| 640 int total_read_num=0; | |
| 641 while(read1_nomt_file.getline(read1_line, MAX_LINE_LEN)) | |
| 642 { | |
| 643 total_read_num++; | |
| 644 ReadInfo read1(read1_line); | |
| 645 map<string, vector<int> > read1_mappos; | |
| 646 if(read1.isUseful) | |
| 647 { | |
| 648 read1.scan(read1_mappos); | |
| 649 } | |
| 650 read2_nomt_file.getline(read2_line, MAX_LINE_LEN); | |
| 651 ReadInfo read2(read2_line); | |
| 652 map<string, vector<int> > read2_mappos; | |
| 653 if(read2.isUseful) | |
| 654 { | |
| 655 read2.scan(read2_mappos); | |
| 656 } | |
| 657 //pairing | |
| 658 ReadPair read_pair_ins (read1_mappos, read2_mappos, read_size); | |
| 659 //feed them into TXdb_entries | |
| 660 map<string , ReadPairInfo>::iterator it = read_pair_ins.mappinginfo.begin(); | |
| 661 for(; it!=read_pair_ins.mappinginfo.end();it++) | |
| 662 { | |
| 663 string eventid = it->first; | |
| 664 map<string, TXdb_entry>::iterator txdb_it = TXdb_entries.find(eventid); | |
| 665 if(txdb_it == TXdb_entries.end() ) | |
| 666 { | |
| 667 continue; | |
| 668 } | |
| 669 //get numbers | |
| 670 for(int i=0; i<4;i++) | |
| 671 { | |
| 672 if(read_pair_ins.mappinginfo[eventid].Pos[i]!=-1) | |
| 673 { | |
| 674 int pos_tmp1 = read_pair_ins.mappinginfo[eventid].Pos[i]; | |
| 675 int pos_tmp2; | |
| 676 if(i<2) | |
| 677 { | |
| 678 if (txdb_it->second.strand == '-') | |
| 679 pos_tmp1 = txdb_it->second.len1 -pos_tmp1 +1- read_size; | |
| 680 pos_tmp2 = pos_tmp1 + read_size; | |
| 681 if (pos_tmp2 <= txdb_it->second.exonsize[0]) | |
| 682 { txdb_it->second.enum1++;continue;} | |
| 683 if (pos_tmp1 <= txdb_it->second.exonsize[0] ) | |
| 684 { | |
| 685 if(read_pair_ins.mappinginfo[eventid].IsoFlag[1] !=1) | |
| 686 { | |
| 687 txdb_it->second.jnum12++; | |
| 688 } | |
| 689 else | |
| 690 { | |
| 691 txdb_it->second.enum1++; | |
| 692 } | |
| 693 continue; | |
| 694 } | |
| 695 if (pos_tmp2 <= txdb_it->second.exonsize[0]+txdb_it->second.exonsize[1]) | |
| 696 { txdb_it->second.enum2++;continue;} | |
| 697 if (pos_tmp1 <= txdb_it->second.exonsize[0]+txdb_it->second.exonsize[1]) | |
| 698 { | |
| 699 if(read_pair_ins.mappinginfo[eventid].IsoFlag[1] !=1) | |
| 700 { | |
| 701 txdb_it->second.jnum23++; | |
| 702 } | |
| 703 else | |
| 704 { | |
| 705 txdb_it->second.enum2++; | |
| 706 } | |
| 707 continue; | |
| 708 | |
| 709 } | |
| 710 txdb_it->second.enum3++; | |
| 711 } | |
| 712 else | |
| 713 { | |
| 714 if (read_pair_ins.mappinginfo[eventid].IsoFlag[0] == 1 | |
| 715 && read_pair_ins.mappinginfo[eventid].IsoFlag[1] == 1) | |
| 716 continue; | |
| 717 if (txdb_it->second.strand == '-') | |
| 718 pos_tmp1 = txdb_it->second.len2 -pos_tmp1 +1- read_size; | |
| 719 pos_tmp2 = pos_tmp1 + read_size; | |
| 720 if (pos_tmp2 <= txdb_it->second.exonsize[0]) | |
| 721 { txdb_it->second.enum1++;continue;} | |
| 722 if (pos_tmp1 <= txdb_it->second.exonsize[0] ) | |
| 723 { | |
| 724 if(read_pair_ins.mappinginfo[eventid].IsoFlag[0] !=1) | |
| 725 { | |
| 726 txdb_it->second.jnum13++; | |
| 727 } | |
| 728 else | |
| 729 { | |
| 730 txdb_it->second.enum1++; | |
| 731 } | |
| 732 continue; | |
| 733 } | |
| 734 txdb_it->second.enum3++; | |
| 735 | |
| 736 | |
| 737 } | |
| 738 } | |
| 739 } | |
| 740 | |
| 741 if(read_pair_ins.mappinginfo[eventid].isPaired) | |
| 742 { | |
| 743 total_pair_read_num += 2; | |
| 744 txdb_it->second.totalreadnum += 2; | |
| 745 (txdb_it->second).pairs.push_back(read_pair_ins.mappinginfo[eventid]); | |
| 746 } | |
| 747 else | |
| 748 { | |
| 749 total_single_read_num += 2; | |
| 750 txdb_it->second.totalreadnum += 1; | |
| 751 if(read_pair_ins.mappinginfo[eventid].IsoFlag[0] == 1) | |
| 752 { | |
| 753 if(read_pair_ins.mappinginfo[eventid].IsoFlag[1] == 0) | |
| 754 { | |
| 755 (txdb_it->second).snum1++; | |
| 756 } | |
| 757 else | |
| 758 { | |
| 759 (txdb_it->second).snum12++; | |
| 760 } | |
| 761 } | |
| 762 else | |
| 763 { | |
| 764 (txdb_it->second).snum2++; | |
| 765 } | |
| 766 } | |
| 767 } | |
| 768 | |
| 769 | |
| 770 } | |
| 771 | |
| 772 read1_nomt_file.close(); | |
| 773 read2_nomt_file.close(); | |
| 774 cout<<"Total read number:\t"<<total_read_num*2<<endl; | |
| 775 cout<<"Singled reads used:\t"<<total_single_read_num<<endl; | |
| 776 cout<<"Paired reads used:\t"<<total_pair_read_num<<endl; | |
| 777 | |
| 778 log_file<<"#total_read_num:\t"<<total_read_num*2<<endl; | |
| 779 log_file<<"#total_single_read_num:\t"<<total_single_read_num<<endl; | |
| 780 log_file<<"#total_pair_read_num:\t"<<total_pair_read_num<<endl; | |
| 781 | |
| 782 //Calculate ratios and output | |
| 783 cout<<"Now doing estimations and output to "<<ratio_file_name<<"\t"<<num_file_name<<endl; | |
| 784 ofstream ratio_file(ratio_file_name.c_str()); | |
| 785 ofstream num_file(num_file_name.c_str()); | |
| 786 map<string, TXdb_entry>::iterator txdb_it = TXdb_entries.begin(); | |
| 787 for(;txdb_it!=TXdb_entries.end();txdb_it++) | |
| 788 { | |
| 789 string eventid = txdb_it->first; | |
| 790 //cout<< eventid <<endl; | |
| 791 txdb_it->second.estimate(FZM,IRM,read_size); | |
| 792 /* ratio_file<< eventid<<"\t" | |
| 793 <<setprecision(4)<<fixed | |
| 794 <<txdb_it->second.ir<<"\t" | |
| 795 <<txdb_it->second.bir<<"\t" | |
| 796 <<"NA\tNA\tNA\t" | |
| 797 <<txdb_it->second.jnum12<<"\t" | |
| 798 <<txdb_it->second.jnum23<<"\t" | |
| 799 <<txdb_it->second.jnum13<<"\t" | |
| 800 <<(1.0*read_size*( txdb_it->second.enum1+txdb_it->second.jnum12 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[0] )<<"\t" | |
| 801 <<(1.0*read_size*( txdb_it->second.enum2+txdb_it->second.jnum12 +txdb_it->second.jnum23 )/txdb_it->second.exonsize[1] )<<"\t" | |
| 802 <<(1.0*read_size*( txdb_it->second.enum3+txdb_it->second.jnum23 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[2] )<<"\t" | |
| 803 <<txdb_it->second.exonsize[0]<<"\t" | |
| 804 <<txdb_it->second.exonsize[1]<<"\t" | |
| 805 <<txdb_it->second.exonsize[2]<<"\t" | |
| 806 <<(1.0*read_size*txdb_it->second.totalreadnum/txdb_it->second.len1)<<endl; | |
| 807 */ | |
| 808 ratio_file<<eventid<<"\t" | |
| 809 <<setprecision(4)<<fixed | |
| 810 <<txdb_it->second.ir<<"\t" //1 | |
| 811 <<txdb_it->second.bir<<"\t" //2 | |
| 812 <<txdb_it->second.chrid<<"\t" //3 | |
| 813 <<txdb_it->second.start<<"," // | |
| 814 <<txdb_it->second.start+txdb_it->second.exonsize[0]<<"\t" //4 | |
| 815 <<txdb_it->second.start+txdb_it->second.exonstarts[1]<<"," | |
| 816 <<txdb_it->second.start+txdb_it->second.exonstarts[1]+txdb_it->second.exonsize[1]<<"\t"//5 | |
| 817 <<txdb_it->second.start+txdb_it->second.exonstarts[2]<<"," | |
| 818 <<txdb_it->second.start+txdb_it->second.exonstarts[2]+txdb_it->second.exonsize[2]<<"\t"//6 | |
| 819 <<txdb_it->second.strand<<"\t"//7 | |
| 820 <<txdb_it->second.jnum12<<"\t"//8 | |
| 821 <<txdb_it->second.jnum23<<"\t"//9 | |
| 822 <<txdb_it->second.jnum13<<"\t"//10 | |
| 823 <<(1.0*read_size*( txdb_it->second.enum1+txdb_it->second.jnum12 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[0] )<<"\t"//11 | |
| 824 <<(1.0*read_size*( txdb_it->second.enum2+txdb_it->second.jnum12 +txdb_it->second.jnum23 )/txdb_it->second.exonsize[1] )<<"\t"//12 | |
| 825 <<(1.0*read_size*( txdb_it->second.enum3+txdb_it->second.jnum23 +txdb_it->second.jnum13 )/txdb_it->second.exonsize[2] )<<"\t"//13` | |
| 826 <<(1.0*read_size*txdb_it->second.totalreadnum/txdb_it->second.len1)<<"\t"//14 | |
| 827 <<txdb_it->second.exonsize[0]<<"\t"//15 | |
| 828 <<txdb_it->second.exonsize[1]<<"\t"//16 | |
| 829 <<txdb_it->second.exonsize[2]<<endl;//17 | |
| 830 | |
| 831 num_file<< eventid<<"\t" | |
| 832 <<txdb_it->second.chrid<<"\t" | |
| 833 <<txdb_it->second.start<<"," | |
| 834 <<txdb_it->second.start+txdb_it->second.exonsize[0]<<"\t" | |
| 835 <<txdb_it->second.start+txdb_it->second.exonstarts[1]<<"," | |
| 836 <<txdb_it->second.start+txdb_it->second.exonstarts[1]+txdb_it->second.exonsize[1]<<"\t" | |
| 837 <<txdb_it->second.start+txdb_it->second.exonstarts[2]<<"," | |
| 838 <<txdb_it->second.start+txdb_it->second.exonstarts[2]+txdb_it->second.exonsize[2]<<"\t" | |
| 839 <<txdb_it->second.strand<<"\t" | |
| 840 <<txdb_it->second.enum1<<"\t" | |
| 841 <<txdb_it->second.enum2<<"\t" | |
| 842 <<txdb_it->second.enum3<<"\t" | |
| 843 <<txdb_it->second.jnum12<<"\t" | |
| 844 <<txdb_it->second.jnum23<<"\t" | |
| 845 <<txdb_it->second.jnum13<<endl; | |
| 846 } | |
| 847 log_file.close(); | |
| 848 ratio_file.close(); | |
| 849 num_file.close(); | |
| 850 clock_finish = clock(); | |
| 851 cout<<"Done! time used:"<<(double)(clock_finish-clock_start)/CLOCKS_PER_SEC<<" seconds"<<endl; | |
| 852 | |
| 853 | |
| 854 } |
