Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/CombineSubexons.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <stdio.h> | |
2 #include <string.h> | |
3 | |
4 #include <algorithm> | |
5 #include <vector> | |
6 #include <map> | |
7 #include <string> | |
8 | |
9 #include "alignments.hpp" | |
10 #include "blocks.hpp" | |
11 #include "stats.hpp" | |
12 #include "SubexonGraph.hpp" | |
13 | |
14 char usage[] = "combineSubexons [options]\n" | |
15 "Required options:\n" | |
16 "\t-s STRING: the path to the predicted subexon information. Can use multiple -s to specify multiple subexon prediction files\n" | |
17 "\t\tor\n" | |
18 "\t--ls STRING: the path to file of the list of the predicted subexon information.\n" ; | |
19 | |
20 struct _overhang | |
21 { | |
22 int cnt ; // the number of samples support this subexon. | |
23 int validCnt ; // The number of samples that are used for compute probability. | |
24 int length ; | |
25 double classifier ; | |
26 } ; | |
27 | |
28 struct _intronicInfo | |
29 { | |
30 int chrId ; | |
31 int start, end ; | |
32 int leftSubexonIdx, rightSubexonIdx ; | |
33 double irClassifier ; | |
34 int irCnt ; | |
35 int validIrCnt ; | |
36 struct _overhang leftOverhang, rightOverhang ; // leftOverhangClassifier is for the overhang subexon at the left side of this intron. | |
37 } ; | |
38 | |
39 struct _seInterval | |
40 { | |
41 int chrId ; | |
42 int start, end ; | |
43 int type ; // 0-subexon, 1-intronicInfo | |
44 int idx ; | |
45 } ; | |
46 | |
47 struct _subexonSplit | |
48 { | |
49 int chrId ; | |
50 int pos ; | |
51 int type ; //1-start of a subexon. 2-end of a subexon | |
52 int splitType ; //0-soft boundary, 1-start of an exon, 2-end of an exon. | |
53 int strand ; | |
54 | |
55 int weight ; | |
56 } ; | |
57 | |
58 struct _interval // exon or intron | |
59 { | |
60 int chrId ; | |
61 int start, end ; | |
62 int strand ; | |
63 int sampleSupport ; | |
64 } ; | |
65 | |
66 struct _subexonSupplement // supplement the subexon structure defined in SubexonGraph. | |
67 { | |
68 int *nextSupport ; | |
69 int *prevSupport ; | |
70 } ; | |
71 | |
72 char buffer[4096] ; | |
73 | |
74 bool CompSubexonSplit( struct _subexonSplit a, struct _subexonSplit b ) | |
75 { | |
76 if ( a.chrId < b.chrId ) | |
77 return true ; | |
78 else if ( a.chrId > b.chrId ) | |
79 return false ; | |
80 else if ( a.pos != b.pos ) | |
81 return a.pos < b.pos ; | |
82 else if ( a.type != b.type ) | |
83 { | |
84 // the split site with no strand information should come first. | |
85 /*if ( a.splitType != b.splitType ) | |
86 { | |
87 if ( a.splitType == 0 ) | |
88 return true ; | |
89 else if ( b.splitType == 0 ) | |
90 return false ; | |
91 }*/ | |
92 return a.type < b.type ; | |
93 } | |
94 else if ( a.splitType != b.splitType ) | |
95 { | |
96 //return a.splitType < b.splitType ; | |
97 if ( a.splitType == 0 ) | |
98 return true ; | |
99 else if ( b.splitType == 0 ) | |
100 return false ; | |
101 | |
102 if ( a.type == 1 ) | |
103 return a.splitType > b.splitType ; | |
104 else | |
105 return a.splitType < b.splitType ; | |
106 } | |
107 | |
108 return false ; | |
109 } | |
110 | |
111 bool CompInterval( struct _interval a, struct _interval b ) | |
112 { | |
113 if ( a.chrId < b.chrId ) | |
114 return true ; | |
115 else if ( a.chrId > b.chrId ) | |
116 return false ; | |
117 else if ( a.start != b.start ) | |
118 return a.start < b.start ; | |
119 else if ( a.end != b.end ) | |
120 return a.end < b.end ; | |
121 return false ; | |
122 } | |
123 | |
124 bool CompSeInterval( struct _seInterval a, struct _seInterval b ) | |
125 { | |
126 if ( a.chrId < b.chrId ) | |
127 return true ; | |
128 else if ( a.chrId > b.chrId ) | |
129 return false ; | |
130 else if ( a.start < b.start ) | |
131 return true ; | |
132 else if ( a.start > b.start ) | |
133 return false ; | |
134 else if ( a.end < b.end ) | |
135 return true ; | |
136 else | |
137 return false ; | |
138 } | |
139 | |
140 // Keep this the same as in SubexonInfo.cpp. | |
141 double TransformCov( double c ) | |
142 { | |
143 double ret ; | |
144 //return sqrt( c ) - 1 ; | |
145 | |
146 if ( c <= 2 + 1e-6 ) | |
147 ret = 1e-6 ; | |
148 else | |
149 ret = c - 2 ; | |
150 | |
151 return ret ; | |
152 } | |
153 | |
154 double GetUpdateMixtureGammaClassifier( double ratio, double cov, double piRatio, double kRatio[2], double thetaRatio[2], | |
155 double piCov, double kCov[2], double thetaCov[2], bool conservative ) | |
156 { | |
157 double p1 = 0, p2 ; | |
158 | |
159 cov = TransformCov( cov ) ; | |
160 if ( cov < ( kCov[0] - 1 ) * thetaCov[0] ) | |
161 cov = ( kCov[0] - 1 ) * thetaCov[0] ; | |
162 | |
163 if ( ratio > 0 ) | |
164 p1 = MixtureGammaAssignment( ratio, piRatio, kRatio, thetaRatio ) ; | |
165 // Make sure cov > 1? | |
166 p2 = MixtureGammaAssignment( cov, piCov, kCov, thetaCov ) ; | |
167 double ret = 0 ; | |
168 | |
169 if ( conservative ) | |
170 { | |
171 if ( p1 >= p2 ) // we should use ratio. | |
172 ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] ) | |
173 - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ; | |
174 else | |
175 ret = LogGammaDensity( cov, kCov[1], thetaCov[1] ) | |
176 - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ; | |
177 } | |
178 else | |
179 { | |
180 if ( p1 >= p2 ) // we should use ratio. | |
181 ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] ) | |
182 - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ; | |
183 else | |
184 ret = LogGammaDensity( cov, kCov[1], thetaCov[1] ) | |
185 - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ; | |
186 } | |
187 return ret ; | |
188 } | |
189 | |
190 double GetPValueOfGeneEnd( double cov ) | |
191 { | |
192 if ( cov <= 2.0 ) | |
193 return 1.0 ; | |
194 double tmp = 2.0 * ( sqrt( cov ) - log( cov ) ) ; | |
195 if ( tmp <= 0 ) | |
196 return 1.0 ; | |
197 return 2.0 * alnorm( tmp, true ) ; | |
198 } | |
199 | |
200 char StrandNumToSymbol( int strand ) | |
201 { | |
202 if ( strand > 0 ) | |
203 return '+' ; | |
204 else if ( strand < 0 ) | |
205 return '-' ; | |
206 else | |
207 return '.' ; | |
208 } | |
209 | |
210 int StrandSymbolToNum( char c ) | |
211 { | |
212 if ( c == '+' ) | |
213 return 1 ; | |
214 else if ( c == '-' ) | |
215 return -1 ; | |
216 else | |
217 return 0 ; | |
218 } | |
219 | |
220 int *MergePositions( int *old, int ocnt, int *add, int acnt, int &newCnt ) //, int **support ) | |
221 { | |
222 int i, j, k ; | |
223 int *ret ; | |
224 if ( acnt == 0 ) | |
225 { | |
226 newCnt = ocnt ; | |
227 return old ; | |
228 } | |
229 if ( ocnt == 0 ) | |
230 { | |
231 newCnt = acnt ; | |
232 ret = new int[acnt] ; | |
233 //*support = new int[acnt] ; | |
234 for ( i = 0 ; i < acnt ; ++i ) | |
235 { | |
236 //(*support)[i] = 1 ; | |
237 ret[i] = add[i] ; | |
238 } | |
239 return ret ; | |
240 } | |
241 newCnt = 0 ; | |
242 for ( i = 0, j = 0 ; i < ocnt && j < acnt ; ) | |
243 { | |
244 if ( old[i] < add[j] ) | |
245 { | |
246 ++i ; | |
247 ++newCnt ; | |
248 } | |
249 else if ( old[i] == add[j] ) | |
250 { | |
251 ++i ; ++j ; | |
252 ++newCnt ; | |
253 } | |
254 else | |
255 { | |
256 ++j ; | |
257 ++newCnt ; | |
258 } | |
259 } | |
260 newCnt = newCnt + ( ocnt - i ) + ( acnt - j ) ; | |
261 // no new elements. | |
262 if ( newCnt == ocnt ) | |
263 { | |
264 /*i = 0 ; | |
265 for ( j = 0 ; j < acnt ; ++j ) | |
266 { | |
267 for ( ; old[i] < add[j] ; ++i ) | |
268 ; | |
269 ++(*support)[i] ; | |
270 }*/ | |
271 return old ; | |
272 } | |
273 k = 0 ; | |
274 //delete []old ; | |
275 ret = new int[ newCnt ] ; | |
276 //int *bufferSupport = new int[newCnt] ; | |
277 for ( i = 0, j = 0 ; i < ocnt && j < acnt ; ) | |
278 { | |
279 if ( old[i] < add[j] ) | |
280 { | |
281 ret[k] = old[i] ; | |
282 //bufferSupport[k] = (*support)[i] ; | |
283 ++i ; | |
284 ++k ; | |
285 } | |
286 else if ( old[i] == add[j] ) | |
287 { | |
288 ret[k] = old[i] ; | |
289 //bufferSupport[k] = (*support)[i] + 1 ; | |
290 ++i ; ++j ; | |
291 ++k ; | |
292 } | |
293 else | |
294 { | |
295 ret[k] = add[j] ; | |
296 //bufferSupport[k] = 1 ; | |
297 ++j ; | |
298 ++k ; | |
299 } | |
300 } | |
301 for ( ; i < ocnt ; ++i, ++k ) | |
302 { | |
303 ret[k] = old[i] ; | |
304 //bufferSupport[k] = (*support)[i] ; | |
305 } | |
306 for ( ; j < acnt ; ++j, ++k ) | |
307 { | |
308 ret[k] = add[j] ; | |
309 //bufferSupport[k] = 1 ; | |
310 } | |
311 delete[] old ; | |
312 //delete[] *support ; | |
313 //*support = bufferSupport ; | |
314 return ret ; | |
315 } | |
316 | |
317 void CoalesceSubexonSplits( std::vector<struct _subexonSplit> &splits, int mid ) | |
318 { | |
319 int i, j, k ; | |
320 int cnt = splits.size() ; | |
321 //std::sort( splits.begin(), splits.end(), CompSubexonSplit ) ; | |
322 | |
323 std::vector<struct _subexonSplit> sortedSplits ; | |
324 sortedSplits.resize( cnt ) ; | |
325 | |
326 k = 0 ; | |
327 for ( i = 0, j = mid ; i < mid && j < cnt ; ++k ) | |
328 { | |
329 if ( CompSubexonSplit( splits[i], splits[j] ) ) | |
330 { | |
331 sortedSplits[k] = splits[i] ; | |
332 ++i ; | |
333 } | |
334 else | |
335 { | |
336 sortedSplits[k] = splits[j] ; | |
337 ++j ; | |
338 } | |
339 } | |
340 for ( ; i < mid ; ++i, ++k ) | |
341 sortedSplits[k] = splits[i] ; | |
342 for ( ; j < cnt ; ++j, ++k ) | |
343 sortedSplits[k] = splits[j] ; | |
344 splits = sortedSplits ; | |
345 | |
346 k = 0 ; | |
347 for ( i = 1 ; i < cnt ; ++i ) | |
348 { | |
349 if ( splits[i].chrId == splits[k].chrId && splits[i].pos == splits[k].pos && splits[i].type == splits[k].type && splits[i].splitType == splits[k].splitType | |
350 && splits[i].strand == splits[k].strand ) | |
351 { | |
352 splits[k].weight += splits[i].weight ; | |
353 } | |
354 else | |
355 { | |
356 ++k ; | |
357 splits[k] = splits[i] ; | |
358 } | |
359 } | |
360 splits.resize( k + 1 ) ; | |
361 } | |
362 | |
363 void CoalesceDifferentStrandSubexonSplits( std::vector<struct _subexonSplit> &splits ) | |
364 { | |
365 int i, j, k, l ; | |
366 int cnt = splits.size() ; | |
367 k = 0 ; | |
368 for ( i = 0 ; i < cnt ; ) | |
369 { | |
370 for ( j = i + 1 ; j < cnt ; ++j ) | |
371 { | |
372 if ( splits[i].chrId == splits[j].chrId && splits[i].pos == splits[j].pos && splits[i].type == splits[j].type && splits[i].splitType == splits[j].splitType ) | |
373 continue ; | |
374 break ; | |
375 } | |
376 | |
377 int maxWeight = -1 ; | |
378 int weightSum = 0 ; | |
379 int strand = splits[i].strand ; | |
380 for ( l = i ; l < j ; ++l ) | |
381 { | |
382 weightSum += splits[l].weight ; | |
383 if ( splits[l].strand != 0 && splits[l].weight > maxWeight ) | |
384 { | |
385 strand = splits[l].strand ; | |
386 maxWeight = splits[l].weight ; | |
387 } | |
388 } | |
389 //printf( "%d: %d %d %d\n", splits[i].pos, i, j, strand ) ; | |
390 splits[k] = splits[i] ; | |
391 splits[k].strand = strand ; | |
392 splits[k].weight = weightSum ; | |
393 ++k ; | |
394 | |
395 i = j ; | |
396 } | |
397 splits.resize( k ) ; | |
398 } | |
399 | |
400 | |
401 void CoalesceIntervals( std::vector<struct _interval> &intervals ) | |
402 { | |
403 int i, k ; | |
404 std::sort( intervals.begin(), intervals.end(), CompInterval ) ; | |
405 int cnt = intervals.size() ; | |
406 k = 0 ; | |
407 for ( i = 1 ; i < cnt ; ++i ) | |
408 { | |
409 if ( intervals[i].chrId == intervals[k].chrId && intervals[i].start == intervals[k].start && intervals[i].end == intervals[k].end ) | |
410 intervals[k].sampleSupport += intervals[i].sampleSupport ; | |
411 else | |
412 { | |
413 ++k ; | |
414 intervals[k] = intervals[i] ; | |
415 } | |
416 } | |
417 intervals.resize( k + 1 ) ; | |
418 } | |
419 | |
420 void CleanIntervalIrOverhang( std::vector<struct _interval> &irOverhang ) | |
421 { | |
422 int i, j, k ; | |
423 std::sort( irOverhang.begin(), irOverhang.end(), CompInterval ) ; | |
424 | |
425 int cnt = irOverhang.size() ; | |
426 for ( i = 0 ; i < cnt ; ++i ) | |
427 { | |
428 if ( irOverhang[i].start == -1 ) | |
429 continue ; | |
430 | |
431 | |
432 // locate the longest interval start at the same coordinate. | |
433 int tag = i ; | |
434 | |
435 for ( j = i + 1 ; j < cnt ; ++j ) | |
436 { | |
437 if ( irOverhang[j].chrId != irOverhang[i].chrId || irOverhang[j].start != irOverhang[i].start ) | |
438 break ; | |
439 if ( irOverhang[j].start == -1 ) | |
440 continue ; | |
441 tag = j ; | |
442 } | |
443 | |
444 for ( k = i ; k < tag ; ++k ) | |
445 { | |
446 irOverhang[k].start = -1 ; | |
447 } | |
448 | |
449 for ( k = tag + 1 ; k < cnt ; ++k ) | |
450 { | |
451 if ( irOverhang[k].chrId != irOverhang[tag].chrId || irOverhang[k].start > irOverhang[tag].end ) | |
452 break ; | |
453 if ( irOverhang[k].end <= irOverhang[tag].end ) | |
454 { | |
455 irOverhang[k].start = -1 ; | |
456 } | |
457 } | |
458 } | |
459 | |
460 k = 0 ; | |
461 for ( i = 0 ; i < cnt ; ++i ) | |
462 { | |
463 if ( irOverhang[i].start == -1 ) | |
464 continue ; | |
465 irOverhang[k] = irOverhang[i] ; | |
466 ++k ; | |
467 } | |
468 irOverhang.resize( k ) ; | |
469 } | |
470 | |
471 // Remove the connection that does not match the boundary | |
472 // of subexons. | |
473 void CleanUpSubexonConnections( std::vector<struct _subexon> &subexons ) | |
474 { | |
475 int seCnt = subexons.size() ; | |
476 int i, j, k, m ; | |
477 for ( i = 0 ; i < seCnt ; ++i ) | |
478 { | |
479 if ( subexons[i].prevCnt > 0 ) | |
480 { | |
481 for ( k = i ; k >= 0 ; --k ) | |
482 if ( subexons[k].chrId != subexons[i].chrId || subexons[k].end <= subexons[i].prev[0] ) | |
483 break ; | |
484 if ( subexons[k].chrId != subexons[i].chrId ) | |
485 ++k ; | |
486 m = 0 ; | |
487 for ( j = 0 ; j < subexons[i].prevCnt ; ++j ) | |
488 { | |
489 for ( ; k <= i ; ++k ) | |
490 if ( subexons[k].end >= subexons[i].prev[j] ) | |
491 break ; | |
492 if ( subexons[k].end == subexons[i].prev[j] | |
493 && ( SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) || subexons[k].end + 1 == subexons[i].start ) ) | |
494 { | |
495 subexons[i].prev[m] = subexons[i].prev[j] ; | |
496 ++m ; | |
497 } | |
498 } | |
499 subexons[i].prevCnt = m ; | |
500 } | |
501 | |
502 m = 0 ; | |
503 k = i ; | |
504 for ( j = 0 ; j < subexons[i].nextCnt ; ++j ) | |
505 { | |
506 for ( ; k < seCnt ; ++k ) | |
507 if ( subexons[k].chrId != subexons[i].chrId || subexons[k].start >= subexons[i].next[j] ) | |
508 break ; | |
509 if ( subexons[k].start == subexons[i].next[j] | |
510 && ( SubexonGraph::IsSameStrand( subexons[i].rightStrand, subexons[k].leftStrand ) || subexons[i].end + 1 == subexons[k].start ) ) | |
511 { | |
512 subexons[i].next[m] = subexons[i].next[j] ; | |
513 ++m ; | |
514 } | |
515 } | |
516 subexons[i].nextCnt = m ; | |
517 } | |
518 } | |
519 | |
520 int main( int argc, char *argv[] ) | |
521 { | |
522 int i, j, k ; | |
523 FILE *fp ; | |
524 std::vector<char *> files ; | |
525 | |
526 Blocks regions ; | |
527 Alignments alignments ; | |
528 | |
529 if ( argc == 1 ) | |
530 { | |
531 printf( "%s", usage ) ; | |
532 return 0 ; | |
533 } | |
534 | |
535 for ( i = 1 ; i < argc ; ++i ) | |
536 { | |
537 if ( !strcmp( argv[i], "-s" ) ) | |
538 { | |
539 files.push_back( argv[i + 1] ) ; | |
540 ++i ; | |
541 continue ; | |
542 } | |
543 else if ( !strcmp( argv[i], "--ls" ) ) | |
544 { | |
545 FILE *fpLs = fopen( argv[i + 1], "r" ) ; | |
546 char buffer[1024] ; | |
547 while ( fgets( buffer, sizeof( buffer ), fpLs ) != NULL ) | |
548 { | |
549 int len = strlen( buffer ) ; | |
550 if ( buffer[len - 1] == '\n' ) | |
551 { | |
552 buffer[len - 1] = '\0' ; | |
553 --len ; | |
554 | |
555 } | |
556 char *fileName = strdup( buffer ) ; | |
557 files.push_back( fileName ) ; | |
558 } | |
559 } | |
560 } | |
561 int fileCnt = files.size() ; | |
562 // Obtain the chromosome ids through bam file. | |
563 fp = fopen( files[0], "r" ) ; | |
564 if ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) | |
565 { | |
566 int len = strlen( buffer ) ; | |
567 buffer[len - 1] = '\0' ; | |
568 alignments.Open( buffer + 1 ) ; | |
569 } | |
570 fclose( fp ) ; | |
571 | |
572 // Collect the split sites of subexons. | |
573 std::vector<struct _subexonSplit> subexonSplits ; | |
574 std::vector<struct _interval> intervalIrOverhang ; // intervals contains ir and overhang. | |
575 std::vector<struct _interval> introns ; | |
576 std::vector<struct _interval> exons ; | |
577 | |
578 | |
579 for ( k = 0 ; k < fileCnt ; ++k ) | |
580 { | |
581 fp = fopen( files[k], "r" ) ; | |
582 struct _subexon se ; | |
583 struct _subexonSplit sp ; | |
584 char chrName[50] ; | |
585 int origSize = subexonSplits.size() ; | |
586 while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) | |
587 { | |
588 if ( buffer[0] == '#' ) | |
589 continue ; | |
590 | |
591 SubexonGraph::InputSubexon( buffer, alignments, se, false) ; | |
592 // Record all the intron rentention, overhang from the samples | |
593 if ( ( se.leftType == 2 && se.rightType == 1 ) | |
594 || ( se.leftType == 2 && se.rightType == 0 ) | |
595 || ( se.leftType == 0 && se.rightType == 1 ) ) | |
596 { | |
597 struct _interval si ; | |
598 si.chrId = se.chrId ; | |
599 si.start = se.start ; | |
600 si.end = se.end ; | |
601 | |
602 intervalIrOverhang.push_back( si ) ; | |
603 } | |
604 | |
605 // Ignore overhang subexons and ir subexons for now. | |
606 if ( ( se.leftType == 0 && se.rightType == 1 ) | |
607 || ( se.leftType == 2 && se.rightType == 0 ) | |
608 || ( se.leftType == 2 && se.rightType == 1 ) ) | |
609 continue ; | |
610 | |
611 if ( se.leftType == 0 && se.rightType == 0 && ( se.leftClassifier == -1 || se.leftClassifier == 1 ) ) // ignore noisy single-exon island | |
612 continue ; | |
613 if ( se.leftType == 0 && se.rightType == 0 && ( fileCnt >= 10 && se.leftClassifier > 0.99 ) ) | |
614 continue ; | |
615 | |
616 if ( se.leftType == 1 && se.rightType == 2 ) // a full exon, we allow mixtured strand here. | |
617 { | |
618 struct _interval ni ; | |
619 ni.chrId = se.chrId ; | |
620 ni.start = se.start ; | |
621 ni.end = se.end ; | |
622 ni.strand = se.rightStrand ; | |
623 ni.sampleSupport = 1 ; | |
624 exons.push_back( ni ) ; | |
625 } | |
626 | |
627 | |
628 /*for ( i = 0 ; i < se.nextCnt ; ++i ) | |
629 { | |
630 struct _interval ni ; | |
631 ni.chrId = se.chrId ; | |
632 ni.start = se.end ; | |
633 ni.end = se.next[i] ; | |
634 ni.strand = se.rightStrand ; | |
635 ni.sampleSupport = 1 ; | |
636 if ( ni.start + 1 < ni.end ) | |
637 introns.push_back( ni ) ; | |
638 }*/ | |
639 | |
640 sp.chrId = se.chrId ; | |
641 sp.pos = se.start ; | |
642 sp.type = 1 ; | |
643 sp.splitType = se.leftType ; | |
644 sp.strand = se.leftStrand ; | |
645 sp.weight = 1 ; | |
646 subexonSplits.push_back( sp ) ; | |
647 | |
648 sp.chrId = se.chrId ; | |
649 sp.pos = se.end ; | |
650 sp.type = 2 ; | |
651 sp.splitType = se.rightType ; | |
652 sp.strand = se.rightStrand ; | |
653 sp.weight = 1 ; | |
654 subexonSplits.push_back( sp ) ; | |
655 | |
656 /*if ( se.prevCnt > 0 ) | |
657 delete[] se.prev ; | |
658 if ( se.nextCnt > 0 ) | |
659 delete[] se.next ;*/ | |
660 } | |
661 CoalesceIntervals( exons ) ; | |
662 CoalesceIntervals( introns ) ; | |
663 CoalesceSubexonSplits( subexonSplits, origSize ) ; | |
664 CleanIntervalIrOverhang( intervalIrOverhang ) ; | |
665 | |
666 fclose( fp ) ; | |
667 } | |
668 | |
669 CoalesceDifferentStrandSubexonSplits( subexonSplits ) ; | |
670 | |
671 // Obtain the split sites from the introns. | |
672 int intronCnt = introns.size() ; | |
673 std::vector<struct _subexonSplit> intronSplits ; | |
674 for ( i = 0 ; i < intronCnt ; ++i ) | |
675 { | |
676 /*if ( introns[i].sampleSupport < 0.05 * fileCnt ) | |
677 { | |
678 continue ; | |
679 }*/ | |
680 struct _interval &it = introns[i] ; | |
681 struct _subexonSplit sp ; | |
682 sp.chrId = it.chrId ; | |
683 sp.pos = it.start ; | |
684 sp.type = 2 ; | |
685 sp.splitType = 2 ; | |
686 sp.strand = it.strand ; | |
687 intronSplits.push_back( sp ) ; | |
688 | |
689 sp.chrId = it.chrId ; | |
690 sp.pos = it.end ; | |
691 sp.type = 1 ; | |
692 sp.splitType = 1 ; | |
693 sp.strand = it.strand ; | |
694 intronSplits.push_back( sp ) ; | |
695 } | |
696 | |
697 // Pair up the split sites to get subexons | |
698 std::sort( intronSplits.begin(), intronSplits.end(), CompSubexonSplit ) ; | |
699 //std::sort( subexonSplits.begin(), subexonSplits.end(), CompSubexonSplit ) ; | |
700 | |
701 // Convert the hard boundary to soft boundary if the split sites is filtered from the introns | |
702 // Seems NO need to do this now. | |
703 int splitCnt = subexonSplits.size() ; | |
704 int intronSplitCnt = intronSplits.size() ; | |
705 k = 0 ; | |
706 //for ( i = 0 ; i < splitCnt ; ++i ) | |
707 while ( 0 ) | |
708 { | |
709 if ( subexonSplits[i].type != subexonSplits[i].splitType ) | |
710 continue ; | |
711 | |
712 while ( k < intronSplitCnt && ( intronSplits[k].chrId < subexonSplits[i].chrId | |
713 || ( intronSplits[k].chrId == subexonSplits[i].chrId && intronSplits[k].pos < subexonSplits[i].pos ) ) ) | |
714 ++k ; | |
715 j = k ; | |
716 while ( j < intronSplitCnt && intronSplits[j].chrId == subexonSplits[i].chrId | |
717 && intronSplits[j].pos == subexonSplits[i].pos && intronSplits[j].splitType != subexonSplits[i].splitType ) | |
718 ++j ; | |
719 | |
720 // the split site is filtered. | |
721 if ( j >= intronSplitCnt || intronSplits[j].chrId != subexonSplits[i].chrId || | |
722 intronSplits[j].pos > subexonSplits[i].pos ) | |
723 { | |
724 //printf( "%d %d. %d %d\n", subexonSplits[i].pos, intronSplits[j].pos, intronSplits[j].chrId , subexonSplits[i].chrId ) ; | |
725 subexonSplits[i].splitType = 0 ; | |
726 | |
727 // Convert the adjacent subexon split. | |
728 for ( int l = i + 1 ; i < splitCnt && subexonSplits[l].chrId == subexonSplits[i].chrId | |
729 && subexonSplits[l].pos == subexonSplits[i].pos + 1 ; ++l ) | |
730 { | |
731 if ( subexonSplits[l].type != subexonSplits[i].type | |
732 && subexonSplits[l].splitType == subexonSplits[i].splitType ) | |
733 { | |
734 subexonSplits[l].splitType = 0 ; | |
735 } | |
736 } | |
737 | |
738 // And the other direction | |
739 for ( int l = i - 1 ; l >= 0 && subexonSplits[l].chrId == subexonSplits[i].chrId | |
740 && subexonSplits[l].pos == subexonSplits[i].pos - 1 ; --l ) | |
741 { | |
742 if ( subexonSplits[l].type != subexonSplits[i].type | |
743 && subexonSplits[l].splitType == subexonSplits[i].splitType ) | |
744 { | |
745 subexonSplits[l].splitType = 0 ; | |
746 } | |
747 } | |
748 } | |
749 } | |
750 intronSplits.clear() ; | |
751 std::vector<struct _subexonSplit>().swap( intronSplits ) ; | |
752 | |
753 // Force the soft boundary that collides with hard boundaries to be hard boundary. | |
754 for ( i = 0 ; i < splitCnt ; ++i ) | |
755 { | |
756 if ( subexonSplits[i].splitType != 0 ) | |
757 continue ; | |
758 int newSplitType = 0 ; | |
759 int newStrand = subexonSplits[i].strand ; | |
760 for ( j = i + 1 ; j < splitCnt ; ++j ) | |
761 { | |
762 if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos || | |
763 subexonSplits[i].chrId != subexonSplits[j].chrId ) | |
764 break ; | |
765 if ( subexonSplits[j].splitType != 0 ) | |
766 { | |
767 newSplitType = subexonSplits[j].splitType ; | |
768 newStrand = subexonSplits[j].strand ; | |
769 break ; | |
770 } | |
771 } | |
772 | |
773 if ( newSplitType == 0 ) | |
774 { | |
775 for ( j = i - 1 ; j >= 0 ; --j ) | |
776 { | |
777 if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos || | |
778 subexonSplits[i].chrId != subexonSplits[j].chrId ) | |
779 break ; | |
780 if ( subexonSplits[j].splitType != 0 ) | |
781 { | |
782 newSplitType = subexonSplits[j].splitType ; | |
783 newStrand = subexonSplits[j].strand ; | |
784 break ; | |
785 } | |
786 } | |
787 | |
788 } | |
789 /*if ( subexonSplits[i].pos == 154464157 ) | |
790 { | |
791 printf( "force conversion: %d %d %d. %d %d\n", subexonSplits[i].pos, subexonSplits[i].splitType, subexonSplits[i].weight, subexonSplits[i + 1].pos, subexonSplits[i + 1].splitType ) ; | |
792 }*/ | |
793 subexonSplits[i].splitType = newSplitType ; | |
794 subexonSplits[i].strand = newStrand ; | |
795 } | |
796 | |
797 /*for ( i = 0 ; i < splitCnt ; ++i ) | |
798 { | |
799 printf( "%d: type=%d splitType=%d weight=%d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, subexonSplits[i].weight ) ; | |
800 }*/ | |
801 | |
802 // Build subexons from the collected split sites. | |
803 | |
804 std::vector<struct _subexon> subexons ; | |
805 int diffCnt = 0 ; // |start of subexon split| - |end of subexon split| | |
806 int seCnt = 0 ; | |
807 for ( i = 0 ; i < splitCnt - 1 ; ++i ) | |
808 { | |
809 struct _subexon se ; | |
810 /*if ( subexonSplits[i + 1].pos == 144177260 ) | |
811 { | |
812 printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, | |
813 subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; | |
814 }*/ | |
815 | |
816 if ( subexonSplits[i].type == 1 ) | |
817 diffCnt += subexonSplits[i].weight ; | |
818 else | |
819 diffCnt -= subexonSplits[i].weight ; | |
820 | |
821 if ( subexonSplits[i + 1].chrId != subexonSplits[i].chrId ) | |
822 { | |
823 diffCnt = 0 ; | |
824 continue ; | |
825 } | |
826 | |
827 if ( diffCnt == 0 ) // the interval between subexon | |
828 continue ; | |
829 | |
830 se.chrId = subexonSplits[i].chrId ; | |
831 se.start = subexonSplits[i].pos ; | |
832 se.leftType = subexonSplits[i].splitType ; | |
833 se.leftStrand = subexonSplits[i].strand ; | |
834 if ( subexonSplits[i].type == 2 ) | |
835 { | |
836 se.leftStrand = 0 ; | |
837 ++se.start ; | |
838 } | |
839 | |
840 se.end = subexonSplits[i + 1].pos ; | |
841 se.rightType = subexonSplits[i + 1].splitType ; | |
842 se.rightStrand = subexonSplits[i + 1].strand ; | |
843 if ( subexonSplits[i + 1].type == 1 ) | |
844 { | |
845 se.rightStrand = 0 ; | |
846 --se.end ; | |
847 } | |
848 | |
849 /*if ( se.end == 24613649 ) | |
850 { | |
851 //printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, | |
852 // subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; | |
853 printf( "%d %d %d. %d %d %d\n", se.start, se.leftType, se.leftStrand, se.end, se.rightType, se.rightStrand ) ; | |
854 }*/ | |
855 | |
856 if ( se.start > se.end ) //Note: this handles the case of repeated subexon split. | |
857 { | |
858 // handle the case in sample 0: [...[..] | |
859 // in sample 1: [..]...] | |
860 if ( seCnt > 0 && se.end == subexons[seCnt - 1].end && subexons[seCnt - 1].rightType < se.rightType ) | |
861 { | |
862 subexons[seCnt - 1].rightType = se.rightType ; | |
863 subexons[seCnt - 1].rightStrand = se.rightStrand ; | |
864 } | |
865 continue ; | |
866 } | |
867 se.leftClassifier = se.rightClassifier = 0 ; | |
868 se.lcCnt = se.rcCnt = 0 ; | |
869 | |
870 /*if ( 1 ) //se.chrId == 25 ) | |
871 { | |
872 printf( "%d: %d-%d: %d %d\n", se.chrId, se.start, se.end, se.leftType, se.rightType ) ; | |
873 }*/ | |
874 | |
875 | |
876 se.next = se.prev = NULL ; | |
877 se.nextCnt = se.prevCnt = 0 ; | |
878 subexons.push_back( se ) ; | |
879 ++seCnt ; | |
880 } | |
881 subexonSplits.clear() ; | |
882 std::vector<struct _subexonSplit>().swap( subexonSplits ) ; | |
883 | |
884 // Adjust the split type. | |
885 seCnt = subexons.size() ; | |
886 for ( i = 1 ; i < seCnt ; ++i ) | |
887 { | |
888 if ( subexons[i - 1].end + 1 == subexons[i].start ) | |
889 { | |
890 if ( subexons[i - 1].rightType == 0 ) | |
891 subexons[i - 1].rightType = subexons[i].leftType ; | |
892 if ( subexons[i].leftType == 0 ) | |
893 subexons[i].leftType = subexons[i - 1].rightType ; | |
894 } | |
895 } | |
896 | |
897 // Merge the adjacent soft boundaries | |
898 std::vector<struct _subexon> rawSubexons = subexons ; | |
899 int exonCnt = exons.size() ; | |
900 subexons.clear() ; | |
901 | |
902 k = 0 ; // hold index for exon. | |
903 for ( i = 0 ; i < seCnt ; ) | |
904 { | |
905 /*if ( rawSubexons[k].rightType == 0 && rawSubexons[i].leftType == 0 | |
906 && rawSubexons[k].end + 1 == rawSubexons[i].start ) | |
907 { | |
908 rawSubexons[k].end = rawSubexons[i].end ; | |
909 rawSubexons[k].rightType = rawSubexons[i].rightType ; | |
910 rawSubexons[k].rightStrand = rawSubexons[i].rightStrand ; | |
911 } | |
912 else | |
913 { | |
914 subexons.push_back( rawSubexons[k] ) ; | |
915 k = i ; | |
916 }*/ | |
917 | |
918 while ( k < exonCnt && ( exons[k].chrId < rawSubexons[i].chrId | |
919 || ( exons[k].chrId == rawSubexons[i].chrId && exons[k].start < rawSubexons[i].start ) ) ) | |
920 ++k ; | |
921 | |
922 for ( j = i + 1 ; j < seCnt ; ++j ) | |
923 { | |
924 if ( rawSubexons[j - 1].chrId != rawSubexons[j].chrId || rawSubexons[j - 1].rightType != 0 || rawSubexons[j].leftType != 0 | |
925 || ( fileCnt > 1 && rawSubexons[j - 1].end + 1 != rawSubexons[j].start ) | |
926 || ( fileCnt == 1 && rawSubexons[j - 1].end + 50 < rawSubexons[j].start ) ) | |
927 break ; | |
928 } | |
929 // rawsubexons[i...j-1] will be merged. | |
930 | |
931 /*if ( rawSubexons[i].start == 119625875 ) | |
932 { | |
933 printf( "merge j-1: %d %d %d %d\n", rawSubexons[j - 1].end, rawSubexons[j - 1].rightType, | |
934 rawSubexons[j].start, rawSubexons[j].leftType ) ; | |
935 }*/ | |
936 bool merge = true ; | |
937 if ( rawSubexons[i].leftType == 1 && rawSubexons[j - 1].rightType == 2 && j - i > 1 | |
938 && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 ) | |
939 { | |
940 merge = false ; | |
941 int sampleSupport = 0 ; | |
942 for ( int l = k ; l < exonCnt ; ++l ) | |
943 { | |
944 if ( exons[l].chrId != rawSubexons[i].chrId || exons[l].start > rawSubexons[i].start ) | |
945 break ; | |
946 if ( exons[l].end == rawSubexons[j - 1].end ) | |
947 { | |
948 merge = true ; | |
949 sampleSupport = exons[l].sampleSupport ; | |
950 break ; | |
951 } | |
952 } | |
953 | |
954 if ( merge == true && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 ) | |
955 { | |
956 if ( sampleSupport <= 0.2 * fileCnt ) | |
957 { | |
958 merge = false ; | |
959 } | |
960 } | |
961 | |
962 if ( merge == false ) | |
963 { | |
964 if ( j - i >= 3 ) | |
965 { | |
966 rawSubexons[i].end = rawSubexons[ (i + j - 1) / 2 ].start ; | |
967 rawSubexons[j - 1].start = rawSubexons[ (i + j - 1) / 2].end ; | |
968 } | |
969 | |
970 if ( rawSubexons[i].end + 1 == rawSubexons[j - 1].start ) | |
971 { | |
972 --rawSubexons[i].end ; | |
973 ++rawSubexons[j - 1].start ; | |
974 } | |
975 subexons.push_back( rawSubexons[i] ) ; | |
976 subexons.push_back( rawSubexons[j - 1] ) ; | |
977 } | |
978 } | |
979 | |
980 if ( merge ) | |
981 { | |
982 rawSubexons[i].end = rawSubexons[j - 1].end ; | |
983 rawSubexons[i].rightType = rawSubexons[j - 1].rightType ; | |
984 rawSubexons[i].rightStrand = rawSubexons[j - 1].rightStrand ; | |
985 | |
986 if ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType != 0 ) | |
987 { | |
988 rawSubexons[i].start = rawSubexons[ ( i + j - 1 ) / 2 ].start ; | |
989 } | |
990 else if ( rawSubexons[i].rightType == 0 && rawSubexons[i].leftType != 0 ) | |
991 { | |
992 rawSubexons[i].end = rawSubexons[ ( i + j - 1 ) / 2 ].end ; | |
993 } | |
994 | |
995 subexons.push_back( rawSubexons[i] ) ; | |
996 } | |
997 | |
998 i = j ; | |
999 } | |
1000 exons.clear() ; | |
1001 std::vector<struct _interval>().swap( exons ) ; | |
1002 | |
1003 // Remove overhang, ir subexons intron created after putting multiple sample to gether. | |
1004 // eg: s0: [......) | |
1005 // s1: [...]--------[....] | |
1006 // s2: [...]..)-----[....] | |
1007 // Though the overhang from s2 is filtered in readin, there will a new overhang created combining s0,s1. | |
1008 // But be careful about how to compute the classifier for the overhang part contributed from s0. | |
1009 // Furthermore, note that the case of single-exon island showed up in intron retention region after combining is not possible when get here. | |
1010 // eg: s0:[...]-----[...] | |
1011 // s1: (.) | |
1012 // s2:[.............] | |
1013 // After merge adjacent soft boundaries, the single-exon island will disappear. | |
1014 rawSubexons = subexons ; | |
1015 seCnt = subexons.size() ; | |
1016 subexons.clear() ; | |
1017 for ( i = 0 ; i < seCnt ; ++i ) | |
1018 { | |
1019 if ( ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 1 ) // ir | |
1020 || ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 0 ) // overhang | |
1021 || ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType == 1 ) ) | |
1022 continue ; | |
1023 subexons.push_back( rawSubexons[i] ) ; | |
1024 } | |
1025 | |
1026 // Remove the single-exon island if it overlaps with intron retentioned or overhang. | |
1027 rawSubexons = subexons ; | |
1028 seCnt = subexons.size() ; | |
1029 subexons.clear() ; | |
1030 k = 0 ; | |
1031 std::sort( intervalIrOverhang.begin(), intervalIrOverhang.end(), CompInterval ) ; | |
1032 int irOverhangCnt = intervalIrOverhang.size() ; | |
1033 | |
1034 for ( i = 0 ; i < seCnt ; ++i ) | |
1035 { | |
1036 if ( rawSubexons[i].leftType != 0 || rawSubexons[i].rightType != 0 ) | |
1037 { | |
1038 subexons.push_back( rawSubexons[i] ) ; | |
1039 continue ; | |
1040 } | |
1041 | |
1042 while ( k < irOverhangCnt ) | |
1043 { | |
1044 // Locate the interval that before the island | |
1045 if ( intervalIrOverhang[k].chrId < rawSubexons[i].chrId | |
1046 || ( intervalIrOverhang[k].chrId == rawSubexons[i].chrId && intervalIrOverhang[k].end < rawSubexons[i].start ) ) | |
1047 { | |
1048 ++k ; | |
1049 continue ; | |
1050 } | |
1051 break ; | |
1052 } | |
1053 bool overlap = false ; | |
1054 for ( j = k ; j < irOverhangCnt ; ++j ) | |
1055 { | |
1056 if ( intervalIrOverhang[j].chrId > rawSubexons[i].chrId || intervalIrOverhang[j].start > rawSubexons[i].end ) | |
1057 break ; | |
1058 if ( ( intervalIrOverhang[j].start <= rawSubexons[i].start && intervalIrOverhang[j].end >= rawSubexons[i].start ) | |
1059 || ( intervalIrOverhang[j].start <= rawSubexons[i].end && intervalIrOverhang[j].end >= rawSubexons[i].end ) ) | |
1060 { | |
1061 overlap = true ; | |
1062 break ; | |
1063 } | |
1064 } | |
1065 | |
1066 if ( !overlap ) | |
1067 subexons.push_back( rawSubexons[i] ) ; | |
1068 } | |
1069 rawSubexons.clear() ; | |
1070 std::vector<struct _subexon>().swap( rawSubexons ) ; | |
1071 | |
1072 intervalIrOverhang.clear() ; | |
1073 std::vector<struct _interval>().swap( intervalIrOverhang ) ; | |
1074 | |
1075 // Create the dummy intervals. | |
1076 seCnt = subexons.size() ; | |
1077 std::vector<struct _intronicInfo> intronicInfos ; | |
1078 std::vector<struct _seInterval> seIntervals ; | |
1079 std::vector<struct _subexonSupplement> subexonInfo ; | |
1080 | |
1081 //subexonInfo.resize( seCnt ) ; | |
1082 for ( i = 0 ; i < seCnt ; ++i ) | |
1083 { | |
1084 struct _seInterval ni ; // new interval | |
1085 ni.start = subexons[i].start ; | |
1086 ni.end = subexons[i].end ; | |
1087 ni.type = 0 ; | |
1088 ni.idx = i ; | |
1089 ni.chrId = subexons[i].chrId ; | |
1090 seIntervals.push_back( ni ) ; | |
1091 | |
1092 /*if ( subexons[i].end == 42671717 ) | |
1093 { | |
1094 printf( "%d: %d-%d: %d\n", subexons[i].chrId, subexons[i].start, subexons[i].end, subexons[i].rightType ) ; | |
1095 }*/ | |
1096 //subexonInfo[i].prevSupport = subexonInfo[i].nextSupport = NULL ; | |
1097 | |
1098 /*int nexti ; | |
1099 for ( nexti = i + 1 ; nexti < seCnt ; ++nexti ) | |
1100 if ( subexons[ nexti ].leftType == 0 && subexons[nexti].rightType == 0 )*/ | |
1101 | |
1102 if ( i < seCnt - 1 && subexons[i].chrId == subexons[i + 1].chrId && | |
1103 subexons[i].end + 1 < subexons[i + 1].start && | |
1104 subexons[i].rightType + subexons[i + 1].leftType != 0 ) | |
1105 { | |
1106 // Only consider the intervals like ]..[,]...(, )...[ | |
1107 // The case like ]...] is actaully things like ][...] in subexon perspective, | |
1108 // so they won't pass the if-statement | |
1109 struct _intronicInfo nii ; // new intronic info | |
1110 ni.start = subexons[i].end + 1 ; | |
1111 ni.end = subexons[i + 1].start - 1 ; | |
1112 ni.type = 1 ; | |
1113 ni.idx = intronicInfos.size() ; | |
1114 seIntervals.push_back( ni ) ; | |
1115 | |
1116 nii.chrId = subexons[i].chrId ; | |
1117 nii.start = ni.start ; | |
1118 nii.end = ni.end ; | |
1119 nii.leftSubexonIdx = i ; | |
1120 nii.rightSubexonIdx = i + 1 ; | |
1121 nii.irClassifier = 0 ; | |
1122 nii.irCnt = 0 ; | |
1123 nii.validIrCnt = 0 ; | |
1124 nii.leftOverhang.cnt = 0 ; | |
1125 nii.leftOverhang.validCnt = 0 ; | |
1126 nii.leftOverhang.length = 0 ; | |
1127 nii.leftOverhang.classifier = 0 ; | |
1128 nii.rightOverhang.cnt = 0 ; | |
1129 nii.rightOverhang.validCnt = 0 ; | |
1130 nii.rightOverhang.length = 0 ; | |
1131 nii.rightOverhang.classifier = 0 ; | |
1132 intronicInfos.push_back( nii ) ; | |
1133 /*if ( nii.end == 23667 ) | |
1134 { | |
1135 printf( "%d %d. %d (%d %d %d)\n", nii.start, nii.end, subexons[i].rightType, subexons[i+1].start, subexons[i + 1].end, subexons[i + 1].leftType ) ; | |
1136 }*/ | |
1137 } | |
1138 } | |
1139 | |
1140 // Go through all the files to get some statistics number | |
1141 double avgIrPiRatio = 0 ; | |
1142 double avgIrPiCov = 0 ; | |
1143 double irPiRatio, irKRatio[2], irThetaRatio[2] ; // Some statistical results | |
1144 double irPiCov, irKCov[2], irThetaCov[2] ; | |
1145 | |
1146 double avgOverhangPiRatio = 0 ; | |
1147 double avgOverhangPiCov = 0 ; | |
1148 double overhangPiRatio, overhangKRatio[2], overhangThetaRatio[2] ; // Some statistical results | |
1149 double overhangPiCov, overhangKCov[2], overhangThetaCov[2] ; | |
1150 | |
1151 for ( k = 0 ; k < fileCnt ; ++k ) | |
1152 { | |
1153 fp = fopen( files[k], "r" ) ; | |
1154 | |
1155 while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) | |
1156 { | |
1157 if ( buffer[0] == '#' ) | |
1158 { | |
1159 char buffer2[100] ; | |
1160 sscanf( buffer, "%s", buffer2 ) ; | |
1161 if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) ) | |
1162 { | |
1163 // TODO: ignore certain samples if the coverage seems wrong. | |
1164 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
1165 buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0], | |
1166 buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ; | |
1167 avgIrPiRatio += irPiRatio ; | |
1168 } | |
1169 else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) ) | |
1170 { | |
1171 } | |
1172 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) ) | |
1173 { | |
1174 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
1175 buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0], | |
1176 buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ; | |
1177 avgOverhangPiRatio += overhangPiRatio ; | |
1178 } | |
1179 } | |
1180 else | |
1181 break ; | |
1182 } | |
1183 fclose( fp ) ; | |
1184 } | |
1185 avgIrPiRatio /= fileCnt ; | |
1186 avgOverhangPiRatio /= fileCnt ; | |
1187 | |
1188 // Go through all the files to put statistical results into each subexon. | |
1189 std::vector< struct _subexon > sampleSubexons ; | |
1190 int subexonCnt = subexons.size() ; | |
1191 for ( k = 0 ; k < fileCnt ; ++k ) | |
1192 { | |
1193 //if ( k == 220 ) | |
1194 // exit( 1 ) ; | |
1195 fp = fopen( files[k], "r" ) ; | |
1196 struct _subexon se ; | |
1197 struct _subexonSplit sp ; | |
1198 char chrName[50] ; | |
1199 | |
1200 sampleSubexons.clear() ; | |
1201 | |
1202 int tag = 0 ; | |
1203 while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) | |
1204 { | |
1205 if ( buffer[0] == '#' ) | |
1206 { | |
1207 char buffer2[200] ; | |
1208 sscanf( buffer, "%s", buffer2 ) ; | |
1209 if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) ) | |
1210 { | |
1211 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
1212 buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0], | |
1213 buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ; | |
1214 } | |
1215 else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) ) | |
1216 { | |
1217 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
1218 buffer2, buffer2, &irPiCov, buffer2, &irKCov[0], buffer2, &irThetaCov[0], | |
1219 buffer2, &irKCov[1], buffer2, &irThetaCov[1] ) ; | |
1220 } | |
1221 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) ) | |
1222 { | |
1223 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
1224 buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0], | |
1225 buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ; | |
1226 } | |
1227 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_cov:" ) ) | |
1228 { | |
1229 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
1230 buffer2, buffer2, &overhangPiCov, buffer2, &overhangKCov[0], buffer2, &overhangThetaCov[0], | |
1231 buffer2, &overhangKCov[1], buffer2, &overhangThetaCov[1] ) ; | |
1232 } | |
1233 continue ; | |
1234 } | |
1235 else | |
1236 break ; | |
1237 | |
1238 //SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; | |
1239 //sampleSubexons.push_back( se ) ; | |
1240 } | |
1241 | |
1242 //int sampleSubexonCnt = sampleSubexons.size() ; | |
1243 int intervalCnt = seIntervals.size() ; | |
1244 //for ( i = 0 ; i < sampleSubexonCnt ; ++i ) | |
1245 int iterCnt = 0 ; | |
1246 while ( 1 ) | |
1247 { | |
1248 if ( iterCnt > 0 && fgets( buffer, sizeof( buffer), fp ) == NULL) | |
1249 break ; | |
1250 ++iterCnt ; | |
1251 | |
1252 struct _subexon se ; | |
1253 SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; | |
1254 | |
1255 while ( tag < intervalCnt ) | |
1256 { | |
1257 if ( seIntervals[tag].chrId < se.chrId || | |
1258 ( seIntervals[tag].chrId == se.chrId && seIntervals[tag].end < se.start ) ) | |
1259 { | |
1260 ++tag ; | |
1261 continue ; | |
1262 } | |
1263 else | |
1264 break ; | |
1265 } | |
1266 | |
1267 for ( j = tag ; j < intervalCnt ; ++j ) | |
1268 { | |
1269 if ( seIntervals[j].start > se.end || seIntervals[j].chrId > se.chrId ) // terminate if no overlap. | |
1270 break ; | |
1271 int idx ; | |
1272 | |
1273 if ( seIntervals[j].type == 0 ) | |
1274 { | |
1275 idx = seIntervals[j].idx ; | |
1276 if ( subexons[idx].leftType == 1 && se.leftType == 1 && subexons[idx].start == se.start ) | |
1277 { | |
1278 double tmp = se.leftClassifier ; | |
1279 if ( se.leftClassifier == 0 ) | |
1280 tmp = 1e-7 ; | |
1281 subexons[idx].leftClassifier -= 2.0 * log( tmp ) ; | |
1282 ++subexons[idx].lcCnt ; | |
1283 subexons[idx].prev = MergePositions( subexons[idx].prev, subexons[idx].prevCnt, | |
1284 se.prev, se.prevCnt, subexons[idx].prevCnt ) ; | |
1285 | |
1286 if ( se.rightType == 0 ) // a gene end here | |
1287 { | |
1288 for ( int l = idx ; l < subexonCnt ; ++l ) | |
1289 { | |
1290 if ( l > idx && ( subexons[l].end > subexons[l - 1].start + 1 | |
1291 || subexons[l].chrId != subexons[l - 1].chrId ) ) | |
1292 break ; | |
1293 if ( subexons[l].rightType == 2 ) | |
1294 { | |
1295 double adjustAvgDepth = se.avgDepth ; | |
1296 if ( se.end - se.start + 1 >= 100 ) | |
1297 adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ; | |
1298 else | |
1299 adjustAvgDepth *= 2 ; | |
1300 double p = GetPValueOfGeneEnd( adjustAvgDepth ) ; | |
1301 //if ( se.end - se.start + 1 >= 500 && p > 0.001 ) | |
1302 // p = 0.001 ; | |
1303 | |
1304 subexons[l].rightClassifier -= 2.0 * log( p ) ; | |
1305 ++subexons[l].rcCnt ; | |
1306 break ; | |
1307 } | |
1308 } | |
1309 } | |
1310 } | |
1311 //if ( se.chrId == 25 ) | |
1312 // printf( "(%d %d %d. %d) (%d %d %d. %d)\n", se.chrId, se.start, se.end, se.rightType, subexons[idx].chrId, subexons[idx].start, subexons[idx].end, subexons[idx].rightType ) ; | |
1313 if ( subexons[idx].rightType == 2 && se.rightType == 2 && subexons[idx].end == se.end ) | |
1314 { | |
1315 double tmp = se.rightClassifier ; | |
1316 if ( se.rightClassifier == 0 ) | |
1317 tmp = 1e-7 ; | |
1318 subexons[idx].rightClassifier -= 2.0 * log( tmp ) ; | |
1319 ++subexons[idx].rcCnt ; | |
1320 | |
1321 subexons[idx].next = MergePositions( subexons[idx].next, subexons[idx].nextCnt, | |
1322 se.next, se.nextCnt, subexons[idx].nextCnt ) ; | |
1323 | |
1324 if ( se.leftType == 0 ) | |
1325 { | |
1326 for ( int l = idx ; l >= 0 ; --l ) | |
1327 { | |
1328 if ( l < idx && ( subexons[l].end < subexons[l + 1].start - 1 | |
1329 || subexons[l].chrId != subexons[l + 1].chrId ) ) | |
1330 break ; | |
1331 if ( subexons[l].leftType == 1 ) | |
1332 { | |
1333 double adjustAvgDepth = se.avgDepth ; | |
1334 if ( se.end - se.start + 1 >= 100 ) | |
1335 adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ; | |
1336 else | |
1337 adjustAvgDepth *= 2 ; | |
1338 double p = GetPValueOfGeneEnd( adjustAvgDepth ) ; | |
1339 //if ( se.end - se.start + 1 >= 500 && p >= 0.001 ) | |
1340 // p = 0.001 ; | |
1341 subexons[l].leftClassifier -= 2.0 * log( p ) ; | |
1342 ++subexons[l].lcCnt ; | |
1343 break ; | |
1344 } | |
1345 } | |
1346 } | |
1347 } | |
1348 | |
1349 if ( subexons[idx].leftType == 0 && subexons[idx].rightType == 0 | |
1350 && se.leftType == 0 && se.rightType == 0 ) // the single-exon island. | |
1351 { | |
1352 double tmp = se.leftClassifier ; | |
1353 if ( se.leftClassifier == 0 ) | |
1354 tmp = 1e-7 ; | |
1355 subexons[idx].leftClassifier -= 2.0 * log( tmp ) ; | |
1356 subexons[idx].rightClassifier = subexons[idx].leftClassifier ; | |
1357 ++subexons[idx].lcCnt ; | |
1358 ++subexons[idx].rcCnt ; | |
1359 } | |
1360 } | |
1361 else if ( seIntervals[j].type == 1 ) | |
1362 { | |
1363 idx = seIntervals[j].idx ; | |
1364 // Overlap on the left part of intron | |
1365 if ( se.start <= intronicInfos[idx].start && se.end < intronicInfos[idx].end | |
1366 && subexons[ intronicInfos[idx].leftSubexonIdx ].rightType != 0 ) | |
1367 { | |
1368 int len = se.end - intronicInfos[idx].start + 1 ; | |
1369 intronicInfos[idx].leftOverhang.length += len ; | |
1370 ++intronicInfos[idx].leftOverhang.cnt ; | |
1371 | |
1372 // Note that the sample subexon must have a soft boundary at right hand side, | |
1373 // otherwise, this part is not an intron and won't show up in intronic Info. | |
1374 if ( se.leftType == 2 ) | |
1375 { | |
1376 if ( se.leftRatio > 0 && se.avgDepth > 1 ) | |
1377 { | |
1378 ++intronicInfos[idx].leftOverhang.validCnt ; | |
1379 | |
1380 double update = GetUpdateMixtureGammaClassifier( se.leftRatio, se.avgDepth, | |
1381 overhangPiRatio, overhangKRatio, overhangThetaRatio, | |
1382 overhangPiCov, overhangKCov, overhangThetaCov, false ) ; | |
1383 intronicInfos[idx].leftOverhang.classifier += update ; | |
1384 } | |
1385 } | |
1386 else if ( se.leftType == 1 ) | |
1387 { | |
1388 ++intronicInfos[idx].leftOverhang.validCnt ; | |
1389 double update = GetUpdateMixtureGammaClassifier( 1.0, se.avgDepth, | |
1390 overhangPiRatio, overhangKRatio, overhangThetaRatio, | |
1391 overhangPiCov, overhangKCov, overhangThetaCov, true ) ; | |
1392 intronicInfos[idx].leftOverhang.classifier += update ; | |
1393 | |
1394 int seIdx = intronicInfos[idx].leftSubexonIdx ; | |
1395 subexons[seIdx].rightClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ; | |
1396 ++subexons[ seIdx ].rcCnt ; | |
1397 } | |
1398 // ignore the contribution of single-exon island here? | |
1399 } | |
1400 // Overlap on the right part of intron | |
1401 else if ( se.start > intronicInfos[idx].start && se.end >= intronicInfos[idx].end | |
1402 && subexons[ intronicInfos[idx].rightSubexonIdx ].leftType != 0 ) | |
1403 { | |
1404 int len = intronicInfos[idx].end - se.start + 1 ; | |
1405 intronicInfos[idx].rightOverhang.length += len ; | |
1406 ++intronicInfos[idx].rightOverhang.cnt ; | |
1407 | |
1408 // Note that the sample subexon must have a soft boundary at left hand side, | |
1409 // otherwise, this won't show up in intronic Info | |
1410 if ( se.rightType == 1 ) | |
1411 { | |
1412 if ( se.rightRatio > 0 && se.avgDepth > 1 ) | |
1413 { | |
1414 ++intronicInfos[idx].rightOverhang.validCnt ; | |
1415 | |
1416 double update = GetUpdateMixtureGammaClassifier( se.rightRatio, se.avgDepth, | |
1417 overhangPiRatio, overhangKRatio, overhangThetaRatio, | |
1418 overhangPiCov, overhangKCov, overhangThetaCov, false ) ; | |
1419 intronicInfos[idx].rightOverhang.classifier += update ; | |
1420 } | |
1421 } | |
1422 else if ( se.rightType == 2 ) | |
1423 { | |
1424 ++intronicInfos[idx].rightOverhang.validCnt ; | |
1425 | |
1426 double update = GetUpdateMixtureGammaClassifier( 1, se.avgDepth, | |
1427 overhangPiRatio, overhangKRatio, overhangThetaRatio, | |
1428 overhangPiCov, overhangKCov, overhangThetaCov, true ) ; | |
1429 intronicInfos[idx].rightOverhang.classifier += update ; | |
1430 | |
1431 int seIdx = intronicInfos[idx].rightSubexonIdx ; | |
1432 /*if ( subexons[ seIdx ].start == 6873648 ) | |
1433 { | |
1434 printf( "%lf %lf: %lf %lf %lf\n", subexons[seIdx].leftClassifier, GetPValueOfGeneEnd( se.avgDepth ), se.avgDepth, sqrt( se.avgDepth ), log( se.avgDepth ) ) ; | |
1435 }*/ | |
1436 subexons[seIdx].leftClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ; | |
1437 ++subexons[ seIdx ].lcCnt ; | |
1438 } | |
1439 } | |
1440 // Intron is fully contained in this sample subexon, then it is a ir candidate | |
1441 else if ( se.start <= intronicInfos[idx].start && se.end >= intronicInfos[idx].end ) | |
1442 { | |
1443 if ( se.leftType == 2 && se.rightType == 1 ) | |
1444 { | |
1445 double ratio = regions.PickLeftAndRightRatio( se.leftRatio, se.rightRatio ) ; | |
1446 ++intronicInfos[idx].irCnt ; | |
1447 if ( ratio > 0 && se.avgDepth > 1 ) | |
1448 { | |
1449 double update = GetUpdateMixtureGammaClassifier( ratio, se.avgDepth, | |
1450 irPiRatio, irKRatio, irThetaRatio, | |
1451 irPiCov, irKCov, irThetaCov, true ) ; | |
1452 //if ( intronicInfos[idx].start == 37617368 ) | |
1453 // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ; | |
1454 intronicInfos[idx].irClassifier += update ; | |
1455 ++intronicInfos[idx].validIrCnt ; | |
1456 } | |
1457 } | |
1458 else if ( se.leftType == 1 || se.rightType == 2 ) | |
1459 { | |
1460 //intronicInfos[idx].irClassifier += LogGammaDensity( 4.0, irKRatio[1], irThetaRatio[1] ) | |
1461 // - LogGammaDensity( 4.0, irKRatio[0], irThetaRatio[0] ) ; | |
1462 /*if ( se.start == 37617368 ) | |
1463 { | |
1464 printf( "%lf: %lf %lf\n", se.avgDepth, MixtureGammaAssignment( ( irKCov[0] - 1 ) * irThetaCov[0], irPiRatio, irKCov, irThetaCov ), | |
1465 MixtureGammaAssignment( TransformCov( 4.0 ), irPiRatio, irKCov, irThetaCov ) ) ; | |
1466 }*/ | |
1467 if ( se.avgDepth > 1 ) | |
1468 { | |
1469 // let the depth be the threshold to determine. | |
1470 double update = GetUpdateMixtureGammaClassifier( 4.0, se.avgDepth, | |
1471 irPiRatio, irKRatio, irThetaRatio, | |
1472 irPiCov, irKCov, irThetaCov, true ) ; | |
1473 //if ( intronicInfos[idx].start == 36266630 ) | |
1474 // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ; | |
1475 intronicInfos[idx].irClassifier += update ; | |
1476 ++intronicInfos[idx].irCnt ; | |
1477 ++intronicInfos[idx].validIrCnt ; | |
1478 } | |
1479 } | |
1480 else | |
1481 { | |
1482 // the intron is contained in a overhang subexon from the sample or single-exon island | |
1483 } | |
1484 } | |
1485 // sample subexon is contained in the intron. | |
1486 else | |
1487 { | |
1488 // Do nothing. | |
1489 } | |
1490 } | |
1491 } | |
1492 | |
1493 //if ( se.nextCnt > 0 ) | |
1494 delete[] se.next ; | |
1495 //if ( se.prevCnt > 0 ) | |
1496 delete[] se.prev ; | |
1497 } | |
1498 fclose( fp ) ; | |
1499 | |
1500 /*for ( i = 0 ; i < sampleSubexonCnt ; ++i ) | |
1501 { | |
1502 if ( sampleSubexons[i].nextCnt > 0 ) | |
1503 delete[] sampleSubexons[i].next ; | |
1504 if ( sampleSubexons[i].prevCnt > 0 ) | |
1505 delete[] sampleSubexons[i].prev ; | |
1506 }*/ | |
1507 } | |
1508 | |
1509 CleanUpSubexonConnections( subexons ) ; | |
1510 | |
1511 // Convert the temporary statistics number into formal statistics result. | |
1512 for ( i = 0 ; i < subexonCnt ; ++i ) | |
1513 { | |
1514 struct _subexon &se = subexons[i] ; | |
1515 if ( se.leftType == 0 && se.rightType == 0 ) // single-exon txpt. | |
1516 { | |
1517 se.leftClassifier = se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ; | |
1518 } | |
1519 else | |
1520 { | |
1521 if ( se.leftType == 1 ) | |
1522 { | |
1523 se.leftClassifier = 1 - chicdf( se.leftClassifier, 2 * se.lcCnt ) ; | |
1524 } | |
1525 else | |
1526 se.leftClassifier = -1 ; | |
1527 | |
1528 if ( se.rightType == 2 ) | |
1529 se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ; | |
1530 else | |
1531 se.rightClassifier = -1 ; | |
1532 } | |
1533 } | |
1534 | |
1535 int iiCnt = intronicInfos.size() ; //intronicInfo count | |
1536 for ( i = 0 ; i < iiCnt ; ++i ) | |
1537 { | |
1538 struct _intronicInfo &ii = intronicInfos[i] ; | |
1539 if ( ii.validIrCnt > 0 ) | |
1540 { | |
1541 for ( j = 0 ; j < fileCnt - ii.validIrCnt ; ++j ) | |
1542 { | |
1543 ii.irClassifier -= log( 10.0 ) ; | |
1544 } | |
1545 /*if ( ii.validIrCnt < fileCnt * 0.15 ) | |
1546 ii.irClassifier -= log( 1000.0 ) ; | |
1547 else if ( ii.validIrCnt < fileCnt * 0.5 ) | |
1548 ii.irClassifier -= log( 100.0 ) ;*/ | |
1549 ii.irClassifier = (double)1.0 / ( 1.0 + exp( ii.irClassifier + log( 1 - avgIrPiRatio ) - log( avgIrPiRatio ) ) ) ; | |
1550 } | |
1551 else | |
1552 ii.irClassifier = -1 ; | |
1553 | |
1554 if ( ii.leftOverhang.validCnt > 0 ) | |
1555 ii.leftOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.leftOverhang.classifier + | |
1556 log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ; | |
1557 else | |
1558 ii.leftOverhang.classifier = -1 ; | |
1559 | |
1560 if ( ii.rightOverhang.validCnt > 0 ) | |
1561 ii.rightOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.rightOverhang.classifier + | |
1562 log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ; | |
1563 else | |
1564 ii.rightOverhang.classifier = -1 ; | |
1565 } | |
1566 | |
1567 // Change the classifier for the hard boundaries if its adjacent intron has intron retention classifier | |
1568 // which collide with overhang subexon. | |
1569 int intervalCnt = seIntervals.size() ; | |
1570 for ( i = 0 ; i < intervalCnt ; ++i ) | |
1571 { | |
1572 if ( seIntervals[i].type == 1 && intronicInfos[ seIntervals[i].idx ].irCnt > 0 ) | |
1573 { | |
1574 int idx = seIntervals[i].idx ; | |
1575 if ( intronicInfos[idx].leftOverhang.cnt > 0 ) | |
1576 { | |
1577 int k = seIntervals[i - 1].idx ; | |
1578 // Should aim for more conservative? | |
1579 if ( subexons[k].rightClassifier > intronicInfos[idx].leftOverhang.classifier ) | |
1580 subexons[k].rightClassifier = intronicInfos[idx].leftOverhang.classifier ; | |
1581 } | |
1582 | |
1583 if ( intronicInfos[idx].rightOverhang.cnt > 0 ) | |
1584 { | |
1585 int k = seIntervals[i + 1].idx ; | |
1586 if ( subexons[k].leftClassifier > intronicInfos[idx].rightOverhang.classifier ) | |
1587 subexons[k].leftClassifier = intronicInfos[idx].rightOverhang.classifier ; | |
1588 } | |
1589 } | |
1590 } | |
1591 | |
1592 // Output the result. | |
1593 for ( i = 0 ; i < intervalCnt ; ++i ) | |
1594 { | |
1595 if ( seIntervals[i].type == 0 ) | |
1596 { | |
1597 struct _subexon &se = subexons[ seIntervals[i].idx ] ; | |
1598 | |
1599 char ls, rs ; | |
1600 | |
1601 ls = StrandNumToSymbol( se.leftStrand ) ; | |
1602 rs = StrandNumToSymbol( se.rightStrand ) ; | |
1603 | |
1604 printf( "%s %d %d %d %d %c %c -1 -1 -1 %lf %lf ", alignments.GetChromName( se.chrId ), se.start, se.end, | |
1605 se.leftType, se.rightType, ls, rs, se.leftClassifier, se.rightClassifier ) ; | |
1606 if ( i > 0 && seIntervals[i - 1].chrId == seIntervals[i].chrId | |
1607 && seIntervals[i - 1].end + 1 == seIntervals[i].start | |
1608 && !( seIntervals[i - 1].type == 0 && | |
1609 subexons[ seIntervals[i - 1].idx ].rightType != se.leftType ) | |
1610 && !( seIntervals[i - 1].type == 1 && intronicInfos[ seIntervals[i - 1].idx ].irCnt == 0 | |
1611 && intronicInfos[ seIntervals[i - 1].idx ].rightOverhang.cnt == 0 ) | |
1612 && ( se.prevCnt == 0 || se.start - 1 != se.prev[ se.prevCnt - 1 ] ) ) // The connection showed up in the subexon file. | |
1613 { | |
1614 printf( "%d ", se.prevCnt + 1 ) ; | |
1615 for ( j = 0 ; j < se.prevCnt ; ++j ) | |
1616 printf( "%d ", se.prev[j] ) ; | |
1617 printf( "%d ", se.start - 1 ) ; | |
1618 } | |
1619 else | |
1620 { | |
1621 printf( "%d ", se.prevCnt ) ; | |
1622 for ( j = 0 ; j < se.prevCnt ; ++j ) | |
1623 printf( "%d ", se.prev[j] ) ; | |
1624 } | |
1625 | |
1626 if ( i < intervalCnt - 1 && seIntervals[i].chrId == seIntervals[i + 1].chrId | |
1627 && seIntervals[i].end == seIntervals[i + 1].start - 1 | |
1628 && !( seIntervals[i + 1].type == 0 && | |
1629 subexons[ seIntervals[i + 1].idx ].leftType != se.rightType ) | |
1630 && !( seIntervals[i + 1].type == 1 && intronicInfos[ seIntervals[i + 1].idx ].irCnt == 0 | |
1631 && intronicInfos[ seIntervals[i + 1].idx ].leftOverhang.cnt == 0 ) | |
1632 && ( se.nextCnt == 0 || se.end + 1 != se.next[0] ) ) | |
1633 { | |
1634 printf( "%d %d ", se.nextCnt + 1, se.end + 1 ) ; | |
1635 } | |
1636 else | |
1637 printf( "%d ", se.nextCnt ) ; | |
1638 for ( j = 0 ; j < se.nextCnt ; ++j ) | |
1639 printf( "%d ", se.next[j] ) ; | |
1640 printf( "\n" ) ; | |
1641 } | |
1642 else if ( seIntervals[i].type == 1 ) | |
1643 { | |
1644 struct _intronicInfo &ii = intronicInfos[ seIntervals[i].idx ] ; | |
1645 if ( ii.irCnt > 0 ) | |
1646 { | |
1647 printf( "%s %d %d 2 1 . . -1 -1 -1 %lf %lf 1 %d 1 %d\n", | |
1648 alignments.GetChromName( ii.chrId ), ii.start, ii.end, | |
1649 ii.irClassifier, ii.irClassifier, | |
1650 seIntervals[i - 1].end, seIntervals[i + 1].start ) ; | |
1651 } | |
1652 else | |
1653 { | |
1654 // left overhang. | |
1655 if ( ii.leftOverhang.cnt > 0 ) | |
1656 { | |
1657 printf( "%s %d %d 2 0 . . -1 -1 -1 %lf %lf 1 %d 0\n", | |
1658 alignments.GetChromName( ii.chrId ), ii.start, | |
1659 ii.start + ( ii.leftOverhang.length / ii.leftOverhang.cnt ) - 1, | |
1660 ii.leftOverhang.classifier, ii.leftOverhang.classifier, | |
1661 ii.start - 1 ) ; | |
1662 } | |
1663 | |
1664 // right overhang. | |
1665 if ( ii.rightOverhang.cnt > 0 ) | |
1666 { | |
1667 printf( "%s %d %d 0 1 . . -1 -1 -1 %lf %lf 0 1 %d\n", | |
1668 alignments.GetChromName( ii.chrId ), | |
1669 ii.end - ( ii.rightOverhang.length / ii.rightOverhang.cnt ) + 1, ii.end, | |
1670 ii.rightOverhang.classifier, ii.rightOverhang.classifier, | |
1671 ii.end + 1 ) ; | |
1672 } | |
1673 | |
1674 } | |
1675 } | |
1676 } | |
1677 | |
1678 return 0 ; | |
1679 } | |
1680 |