0
|
1 #include <stdio.h>
|
|
2 #include <string.h>
|
|
3
|
|
4 #include <algorithm>
|
|
5 #include <vector>
|
|
6 #include <map>
|
|
7 #include <string>
|
|
8
|
|
9 #include "alignments.hpp"
|
|
10 #include "blocks.hpp"
|
|
11 #include "stats.hpp"
|
|
12 #include "SubexonGraph.hpp"
|
|
13
|
|
14 char usage[] = "combineSubexons [options]\n"
|
|
15 "Required options:\n"
|
|
16 "\t-s STRING: the path to the predicted subexon information. Can use multiple -s to specify multiple subexon prediction files\n"
|
|
17 "\t\tor\n"
|
|
18 "\t--ls STRING: the path to file of the list of the predicted subexon information.\n" ;
|
|
19
|
|
20 struct _overhang
|
|
21 {
|
|
22 int cnt ; // the number of samples support this subexon.
|
|
23 int validCnt ; // The number of samples that are used for compute probability.
|
|
24 int length ;
|
|
25 double classifier ;
|
|
26 } ;
|
|
27
|
|
28 struct _intronicInfo
|
|
29 {
|
|
30 int chrId ;
|
|
31 int start, end ;
|
|
32 int leftSubexonIdx, rightSubexonIdx ;
|
|
33 double irClassifier ;
|
|
34 int irCnt ;
|
|
35 int validIrCnt ;
|
|
36 struct _overhang leftOverhang, rightOverhang ; // leftOverhangClassifier is for the overhang subexon at the left side of this intron.
|
|
37 } ;
|
|
38
|
|
39 struct _seInterval
|
|
40 {
|
|
41 int chrId ;
|
|
42 int start, end ;
|
|
43 int type ; // 0-subexon, 1-intronicInfo
|
|
44 int idx ;
|
|
45 } ;
|
|
46
|
|
47 struct _subexonSplit
|
|
48 {
|
|
49 int chrId ;
|
|
50 int pos ;
|
|
51 int type ; //1-start of a subexon. 2-end of a subexon
|
|
52 int splitType ; //0-soft boundary, 1-start of an exon, 2-end of an exon.
|
|
53 int strand ;
|
|
54
|
|
55 int weight ;
|
|
56 } ;
|
|
57
|
|
58 struct _interval // exon or intron
|
|
59 {
|
|
60 int chrId ;
|
|
61 int start, end ;
|
|
62 int strand ;
|
|
63 int sampleSupport ;
|
|
64 } ;
|
|
65
|
|
66 struct _subexonSupplement // supplement the subexon structure defined in SubexonGraph.
|
|
67 {
|
|
68 int *nextSupport ;
|
|
69 int *prevSupport ;
|
|
70 } ;
|
|
71
|
|
72 char buffer[4096] ;
|
|
73
|
|
74 bool CompSubexonSplit( struct _subexonSplit a, struct _subexonSplit b )
|
|
75 {
|
|
76 if ( a.chrId < b.chrId )
|
|
77 return true ;
|
|
78 else if ( a.chrId > b.chrId )
|
|
79 return false ;
|
|
80 else if ( a.pos != b.pos )
|
|
81 return a.pos < b.pos ;
|
|
82 else if ( a.type != b.type )
|
|
83 {
|
|
84 // the split site with no strand information should come first.
|
|
85 /*if ( a.splitType != b.splitType )
|
|
86 {
|
|
87 if ( a.splitType == 0 )
|
|
88 return true ;
|
|
89 else if ( b.splitType == 0 )
|
|
90 return false ;
|
|
91 }*/
|
|
92 return a.type < b.type ;
|
|
93 }
|
|
94 else if ( a.splitType != b.splitType )
|
|
95 {
|
|
96 //return a.splitType < b.splitType ;
|
|
97 if ( a.splitType == 0 )
|
|
98 return true ;
|
|
99 else if ( b.splitType == 0 )
|
|
100 return false ;
|
|
101
|
|
102 if ( a.type == 1 )
|
|
103 return a.splitType > b.splitType ;
|
|
104 else
|
|
105 return a.splitType < b.splitType ;
|
|
106 }
|
|
107
|
|
108 return false ;
|
|
109 }
|
|
110
|
|
111 bool CompInterval( struct _interval a, struct _interval b )
|
|
112 {
|
|
113 if ( a.chrId < b.chrId )
|
|
114 return true ;
|
|
115 else if ( a.chrId > b.chrId )
|
|
116 return false ;
|
|
117 else if ( a.start != b.start )
|
|
118 return a.start < b.start ;
|
|
119 else if ( a.end != b.end )
|
|
120 return a.end < b.end ;
|
|
121 return false ;
|
|
122 }
|
|
123
|
|
124 bool CompSeInterval( struct _seInterval a, struct _seInterval b )
|
|
125 {
|
|
126 if ( a.chrId < b.chrId )
|
|
127 return true ;
|
|
128 else if ( a.chrId > b.chrId )
|
|
129 return false ;
|
|
130 else if ( a.start < b.start )
|
|
131 return true ;
|
|
132 else if ( a.start > b.start )
|
|
133 return false ;
|
|
134 else if ( a.end < b.end )
|
|
135 return true ;
|
|
136 else
|
|
137 return false ;
|
|
138 }
|
|
139
|
|
140 // Keep this the same as in SubexonInfo.cpp.
|
|
141 double TransformCov( double c )
|
|
142 {
|
|
143 double ret ;
|
|
144 //return sqrt( c ) - 1 ;
|
|
145
|
|
146 if ( c <= 2 + 1e-6 )
|
|
147 ret = 1e-6 ;
|
|
148 else
|
|
149 ret = c - 2 ;
|
|
150
|
|
151 return ret ;
|
|
152 }
|
|
153
|
|
154 double GetUpdateMixtureGammaClassifier( double ratio, double cov, double piRatio, double kRatio[2], double thetaRatio[2],
|
|
155 double piCov, double kCov[2], double thetaCov[2], bool conservative )
|
|
156 {
|
|
157 double p1 = 0, p2 ;
|
|
158
|
|
159 cov = TransformCov( cov ) ;
|
|
160 if ( cov < ( kCov[0] - 1 ) * thetaCov[0] )
|
|
161 cov = ( kCov[0] - 1 ) * thetaCov[0] ;
|
|
162
|
|
163 if ( ratio > 0 )
|
|
164 p1 = MixtureGammaAssignment( ratio, piRatio, kRatio, thetaRatio ) ;
|
|
165 // Make sure cov > 1?
|
|
166 p2 = MixtureGammaAssignment( cov, piCov, kCov, thetaCov ) ;
|
|
167 double ret = 0 ;
|
|
168
|
|
169 if ( conservative )
|
|
170 {
|
|
171 if ( p1 >= p2 ) // we should use ratio.
|
|
172 ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] )
|
|
173 - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ;
|
|
174 else
|
|
175 ret = LogGammaDensity( cov, kCov[1], thetaCov[1] )
|
|
176 - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ;
|
|
177 }
|
|
178 else
|
|
179 {
|
|
180 if ( p1 >= p2 ) // we should use ratio.
|
|
181 ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] )
|
|
182 - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ;
|
|
183 else
|
|
184 ret = LogGammaDensity( cov, kCov[1], thetaCov[1] )
|
|
185 - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ;
|
|
186 }
|
|
187 return ret ;
|
|
188 }
|
|
189
|
|
190 double GetPValueOfGeneEnd( double cov )
|
|
191 {
|
|
192 if ( cov <= 2.0 )
|
|
193 return 1.0 ;
|
|
194 double tmp = 2.0 * ( sqrt( cov ) - log( cov ) ) ;
|
|
195 if ( tmp <= 0 )
|
|
196 return 1.0 ;
|
|
197 return 2.0 * alnorm( tmp, true ) ;
|
|
198 }
|
|
199
|
|
200 char StrandNumToSymbol( int strand )
|
|
201 {
|
|
202 if ( strand > 0 )
|
|
203 return '+' ;
|
|
204 else if ( strand < 0 )
|
|
205 return '-' ;
|
|
206 else
|
|
207 return '.' ;
|
|
208 }
|
|
209
|
|
210 int StrandSymbolToNum( char c )
|
|
211 {
|
|
212 if ( c == '+' )
|
|
213 return 1 ;
|
|
214 else if ( c == '-' )
|
|
215 return -1 ;
|
|
216 else
|
|
217 return 0 ;
|
|
218 }
|
|
219
|
|
220 int *MergePositions( int *old, int ocnt, int *add, int acnt, int &newCnt ) //, int **support )
|
|
221 {
|
|
222 int i, j, k ;
|
|
223 int *ret ;
|
|
224 if ( acnt == 0 )
|
|
225 {
|
|
226 newCnt = ocnt ;
|
|
227 return old ;
|
|
228 }
|
|
229 if ( ocnt == 0 )
|
|
230 {
|
|
231 newCnt = acnt ;
|
|
232 ret = new int[acnt] ;
|
|
233 //*support = new int[acnt] ;
|
|
234 for ( i = 0 ; i < acnt ; ++i )
|
|
235 {
|
|
236 //(*support)[i] = 1 ;
|
|
237 ret[i] = add[i] ;
|
|
238 }
|
|
239 return ret ;
|
|
240 }
|
|
241 newCnt = 0 ;
|
|
242 for ( i = 0, j = 0 ; i < ocnt && j < acnt ; )
|
|
243 {
|
|
244 if ( old[i] < add[j] )
|
|
245 {
|
|
246 ++i ;
|
|
247 ++newCnt ;
|
|
248 }
|
|
249 else if ( old[i] == add[j] )
|
|
250 {
|
|
251 ++i ; ++j ;
|
|
252 ++newCnt ;
|
|
253 }
|
|
254 else
|
|
255 {
|
|
256 ++j ;
|
|
257 ++newCnt ;
|
|
258 }
|
|
259 }
|
|
260 newCnt = newCnt + ( ocnt - i ) + ( acnt - j ) ;
|
|
261 // no new elements.
|
|
262 if ( newCnt == ocnt )
|
|
263 {
|
|
264 /*i = 0 ;
|
|
265 for ( j = 0 ; j < acnt ; ++j )
|
|
266 {
|
|
267 for ( ; old[i] < add[j] ; ++i )
|
|
268 ;
|
|
269 ++(*support)[i] ;
|
|
270 }*/
|
|
271 return old ;
|
|
272 }
|
|
273 k = 0 ;
|
|
274 //delete []old ;
|
|
275 ret = new int[ newCnt ] ;
|
|
276 //int *bufferSupport = new int[newCnt] ;
|
|
277 for ( i = 0, j = 0 ; i < ocnt && j < acnt ; )
|
|
278 {
|
|
279 if ( old[i] < add[j] )
|
|
280 {
|
|
281 ret[k] = old[i] ;
|
|
282 //bufferSupport[k] = (*support)[i] ;
|
|
283 ++i ;
|
|
284 ++k ;
|
|
285 }
|
|
286 else if ( old[i] == add[j] )
|
|
287 {
|
|
288 ret[k] = old[i] ;
|
|
289 //bufferSupport[k] = (*support)[i] + 1 ;
|
|
290 ++i ; ++j ;
|
|
291 ++k ;
|
|
292 }
|
|
293 else
|
|
294 {
|
|
295 ret[k] = add[j] ;
|
|
296 //bufferSupport[k] = 1 ;
|
|
297 ++j ;
|
|
298 ++k ;
|
|
299 }
|
|
300 }
|
|
301 for ( ; i < ocnt ; ++i, ++k )
|
|
302 {
|
|
303 ret[k] = old[i] ;
|
|
304 //bufferSupport[k] = (*support)[i] ;
|
|
305 }
|
|
306 for ( ; j < acnt ; ++j, ++k )
|
|
307 {
|
|
308 ret[k] = add[j] ;
|
|
309 //bufferSupport[k] = 1 ;
|
|
310 }
|
|
311 delete[] old ;
|
|
312 //delete[] *support ;
|
|
313 //*support = bufferSupport ;
|
|
314 return ret ;
|
|
315 }
|
|
316
|
|
317 void CoalesceSubexonSplits( std::vector<struct _subexonSplit> &splits, int mid )
|
|
318 {
|
|
319 int i, j, k ;
|
|
320 int cnt = splits.size() ;
|
|
321 //std::sort( splits.begin(), splits.end(), CompSubexonSplit ) ;
|
|
322
|
|
323 std::vector<struct _subexonSplit> sortedSplits ;
|
|
324 sortedSplits.resize( cnt ) ;
|
|
325
|
|
326 k = 0 ;
|
|
327 for ( i = 0, j = mid ; i < mid && j < cnt ; ++k )
|
|
328 {
|
|
329 if ( CompSubexonSplit( splits[i], splits[j] ) )
|
|
330 {
|
|
331 sortedSplits[k] = splits[i] ;
|
|
332 ++i ;
|
|
333 }
|
|
334 else
|
|
335 {
|
|
336 sortedSplits[k] = splits[j] ;
|
|
337 ++j ;
|
|
338 }
|
|
339 }
|
|
340 for ( ; i < mid ; ++i, ++k )
|
|
341 sortedSplits[k] = splits[i] ;
|
|
342 for ( ; j < cnt ; ++j, ++k )
|
|
343 sortedSplits[k] = splits[j] ;
|
|
344 splits = sortedSplits ;
|
|
345
|
|
346 k = 0 ;
|
|
347 for ( i = 1 ; i < cnt ; ++i )
|
|
348 {
|
|
349 if ( splits[i].chrId == splits[k].chrId && splits[i].pos == splits[k].pos && splits[i].type == splits[k].type && splits[i].splitType == splits[k].splitType
|
|
350 && splits[i].strand == splits[k].strand )
|
|
351 {
|
|
352 splits[k].weight += splits[i].weight ;
|
|
353 }
|
|
354 else
|
|
355 {
|
|
356 ++k ;
|
|
357 splits[k] = splits[i] ;
|
|
358 }
|
|
359 }
|
|
360 splits.resize( k + 1 ) ;
|
|
361 }
|
|
362
|
|
363 void CoalesceDifferentStrandSubexonSplits( std::vector<struct _subexonSplit> &splits )
|
|
364 {
|
|
365 int i, j, k, l ;
|
|
366 int cnt = splits.size() ;
|
|
367 k = 0 ;
|
|
368 for ( i = 0 ; i < cnt ; )
|
|
369 {
|
|
370 for ( j = i + 1 ; j < cnt ; ++j )
|
|
371 {
|
|
372 if ( splits[i].chrId == splits[j].chrId && splits[i].pos == splits[j].pos && splits[i].type == splits[j].type && splits[i].splitType == splits[j].splitType )
|
|
373 continue ;
|
|
374 break ;
|
|
375 }
|
|
376
|
|
377 int maxWeight = -1 ;
|
|
378 int weightSum = 0 ;
|
|
379 int strand = splits[i].strand ;
|
|
380 for ( l = i ; l < j ; ++l )
|
|
381 {
|
|
382 weightSum += splits[l].weight ;
|
|
383 if ( splits[l].strand != 0 && splits[l].weight > maxWeight )
|
|
384 {
|
|
385 strand = splits[l].strand ;
|
|
386 maxWeight = splits[l].weight ;
|
|
387 }
|
|
388 }
|
|
389 //printf( "%d: %d %d %d\n", splits[i].pos, i, j, strand ) ;
|
|
390 splits[k] = splits[i] ;
|
|
391 splits[k].strand = strand ;
|
|
392 splits[k].weight = weightSum ;
|
|
393 ++k ;
|
|
394
|
|
395 i = j ;
|
|
396 }
|
|
397 splits.resize( k ) ;
|
|
398 }
|
|
399
|
|
400
|
|
401 void CoalesceIntervals( std::vector<struct _interval> &intervals )
|
|
402 {
|
|
403 int i, k ;
|
|
404 std::sort( intervals.begin(), intervals.end(), CompInterval ) ;
|
|
405 int cnt = intervals.size() ;
|
|
406 k = 0 ;
|
|
407 for ( i = 1 ; i < cnt ; ++i )
|
|
408 {
|
|
409 if ( intervals[i].chrId == intervals[k].chrId && intervals[i].start == intervals[k].start && intervals[i].end == intervals[k].end )
|
|
410 intervals[k].sampleSupport += intervals[i].sampleSupport ;
|
|
411 else
|
|
412 {
|
|
413 ++k ;
|
|
414 intervals[k] = intervals[i] ;
|
|
415 }
|
|
416 }
|
|
417 intervals.resize( k + 1 ) ;
|
|
418 }
|
|
419
|
|
420 void CleanIntervalIrOverhang( std::vector<struct _interval> &irOverhang )
|
|
421 {
|
|
422 int i, j, k ;
|
|
423 std::sort( irOverhang.begin(), irOverhang.end(), CompInterval ) ;
|
|
424
|
|
425 int cnt = irOverhang.size() ;
|
|
426 for ( i = 0 ; i < cnt ; ++i )
|
|
427 {
|
|
428 if ( irOverhang[i].start == -1 )
|
|
429 continue ;
|
|
430
|
|
431
|
|
432 // locate the longest interval start at the same coordinate.
|
|
433 int tag = i ;
|
|
434
|
|
435 for ( j = i + 1 ; j < cnt ; ++j )
|
|
436 {
|
|
437 if ( irOverhang[j].chrId != irOverhang[i].chrId || irOverhang[j].start != irOverhang[i].start )
|
|
438 break ;
|
|
439 if ( irOverhang[j].start == -1 )
|
|
440 continue ;
|
|
441 tag = j ;
|
|
442 }
|
|
443
|
|
444 for ( k = i ; k < tag ; ++k )
|
|
445 {
|
|
446 irOverhang[k].start = -1 ;
|
|
447 }
|
|
448
|
|
449 for ( k = tag + 1 ; k < cnt ; ++k )
|
|
450 {
|
|
451 if ( irOverhang[k].chrId != irOverhang[tag].chrId || irOverhang[k].start > irOverhang[tag].end )
|
|
452 break ;
|
|
453 if ( irOverhang[k].end <= irOverhang[tag].end )
|
|
454 {
|
|
455 irOverhang[k].start = -1 ;
|
|
456 }
|
|
457 }
|
|
458 }
|
|
459
|
|
460 k = 0 ;
|
|
461 for ( i = 0 ; i < cnt ; ++i )
|
|
462 {
|
|
463 if ( irOverhang[i].start == -1 )
|
|
464 continue ;
|
|
465 irOverhang[k] = irOverhang[i] ;
|
|
466 ++k ;
|
|
467 }
|
|
468 irOverhang.resize( k ) ;
|
|
469 }
|
|
470
|
|
471 // Remove the connection that does not match the boundary
|
|
472 // of subexons.
|
|
473 void CleanUpSubexonConnections( std::vector<struct _subexon> &subexons )
|
|
474 {
|
|
475 int seCnt = subexons.size() ;
|
|
476 int i, j, k, m ;
|
|
477 for ( i = 0 ; i < seCnt ; ++i )
|
|
478 {
|
|
479 if ( subexons[i].prevCnt > 0 )
|
|
480 {
|
|
481 for ( k = i ; k >= 0 ; --k )
|
|
482 if ( subexons[k].chrId != subexons[i].chrId || subexons[k].end <= subexons[i].prev[0] )
|
|
483 break ;
|
|
484 if ( subexons[k].chrId != subexons[i].chrId )
|
|
485 ++k ;
|
|
486 m = 0 ;
|
|
487 for ( j = 0 ; j < subexons[i].prevCnt ; ++j )
|
|
488 {
|
|
489 for ( ; k <= i ; ++k )
|
|
490 if ( subexons[k].end >= subexons[i].prev[j] )
|
|
491 break ;
|
|
492 if ( subexons[k].end == subexons[i].prev[j]
|
|
493 && ( SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) || subexons[k].end + 1 == subexons[i].start ) )
|
|
494 {
|
|
495 subexons[i].prev[m] = subexons[i].prev[j] ;
|
|
496 ++m ;
|
|
497 }
|
|
498 }
|
|
499 subexons[i].prevCnt = m ;
|
|
500 }
|
|
501
|
|
502 m = 0 ;
|
|
503 k = i ;
|
|
504 for ( j = 0 ; j < subexons[i].nextCnt ; ++j )
|
|
505 {
|
|
506 for ( ; k < seCnt ; ++k )
|
|
507 if ( subexons[k].chrId != subexons[i].chrId || subexons[k].start >= subexons[i].next[j] )
|
|
508 break ;
|
|
509 if ( subexons[k].start == subexons[i].next[j]
|
|
510 && ( SubexonGraph::IsSameStrand( subexons[i].rightStrand, subexons[k].leftStrand ) || subexons[i].end + 1 == subexons[k].start ) )
|
|
511 {
|
|
512 subexons[i].next[m] = subexons[i].next[j] ;
|
|
513 ++m ;
|
|
514 }
|
|
515 }
|
|
516 subexons[i].nextCnt = m ;
|
|
517 }
|
|
518 }
|
|
519
|
|
520 int main( int argc, char *argv[] )
|
|
521 {
|
|
522 int i, j, k ;
|
|
523 FILE *fp ;
|
|
524 std::vector<char *> files ;
|
|
525
|
|
526 Blocks regions ;
|
|
527 Alignments alignments ;
|
|
528
|
|
529 if ( argc == 1 )
|
|
530 {
|
|
531 printf( "%s", usage ) ;
|
|
532 return 0 ;
|
|
533 }
|
|
534
|
|
535 for ( i = 1 ; i < argc ; ++i )
|
|
536 {
|
|
537 if ( !strcmp( argv[i], "-s" ) )
|
|
538 {
|
|
539 files.push_back( argv[i + 1] ) ;
|
|
540 ++i ;
|
|
541 continue ;
|
|
542 }
|
|
543 else if ( !strcmp( argv[i], "--ls" ) )
|
|
544 {
|
|
545 FILE *fpLs = fopen( argv[i + 1], "r" ) ;
|
|
546 char buffer[1024] ;
|
|
547 while ( fgets( buffer, sizeof( buffer ), fpLs ) != NULL )
|
|
548 {
|
|
549 int len = strlen( buffer ) ;
|
|
550 if ( buffer[len - 1] == '\n' )
|
|
551 {
|
|
552 buffer[len - 1] = '\0' ;
|
|
553 --len ;
|
|
554
|
|
555 }
|
|
556 char *fileName = strdup( buffer ) ;
|
|
557 files.push_back( fileName ) ;
|
|
558 }
|
|
559 }
|
|
560 }
|
|
561 int fileCnt = files.size() ;
|
|
562 // Obtain the chromosome ids through bam file.
|
|
563 fp = fopen( files[0], "r" ) ;
|
|
564 if ( fgets( buffer, sizeof( buffer ), fp ) != NULL )
|
|
565 {
|
|
566 int len = strlen( buffer ) ;
|
|
567 buffer[len - 1] = '\0' ;
|
|
568 alignments.Open( buffer + 1 ) ;
|
|
569 }
|
|
570 fclose( fp ) ;
|
|
571
|
|
572 // Collect the split sites of subexons.
|
|
573 std::vector<struct _subexonSplit> subexonSplits ;
|
|
574 std::vector<struct _interval> intervalIrOverhang ; // intervals contains ir and overhang.
|
|
575 std::vector<struct _interval> introns ;
|
|
576 std::vector<struct _interval> exons ;
|
|
577
|
|
578
|
|
579 for ( k = 0 ; k < fileCnt ; ++k )
|
|
580 {
|
|
581 fp = fopen( files[k], "r" ) ;
|
|
582 struct _subexon se ;
|
|
583 struct _subexonSplit sp ;
|
|
584 char chrName[50] ;
|
|
585 int origSize = subexonSplits.size() ;
|
|
586 while ( fgets( buffer, sizeof( buffer), fp ) != NULL )
|
|
587 {
|
|
588 if ( buffer[0] == '#' )
|
|
589 continue ;
|
|
590
|
|
591 SubexonGraph::InputSubexon( buffer, alignments, se, false) ;
|
|
592 // Record all the intron rentention, overhang from the samples
|
|
593 if ( ( se.leftType == 2 && se.rightType == 1 )
|
|
594 || ( se.leftType == 2 && se.rightType == 0 )
|
|
595 || ( se.leftType == 0 && se.rightType == 1 ) )
|
|
596 {
|
|
597 struct _interval si ;
|
|
598 si.chrId = se.chrId ;
|
|
599 si.start = se.start ;
|
|
600 si.end = se.end ;
|
|
601
|
|
602 intervalIrOverhang.push_back( si ) ;
|
|
603 }
|
|
604
|
|
605 // Ignore overhang subexons and ir subexons for now.
|
|
606 if ( ( se.leftType == 0 && se.rightType == 1 )
|
|
607 || ( se.leftType == 2 && se.rightType == 0 )
|
|
608 || ( se.leftType == 2 && se.rightType == 1 ) )
|
|
609 continue ;
|
|
610
|
|
611 if ( se.leftType == 0 && se.rightType == 0 && ( se.leftClassifier == -1 || se.leftClassifier == 1 ) ) // ignore noisy single-exon island
|
|
612 continue ;
|
|
613 if ( se.leftType == 0 && se.rightType == 0 && ( fileCnt >= 10 && se.leftClassifier > 0.99 ) )
|
|
614 continue ;
|
|
615
|
|
616 if ( se.leftType == 1 && se.rightType == 2 ) // a full exon, we allow mixtured strand here.
|
|
617 {
|
|
618 struct _interval ni ;
|
|
619 ni.chrId = se.chrId ;
|
|
620 ni.start = se.start ;
|
|
621 ni.end = se.end ;
|
|
622 ni.strand = se.rightStrand ;
|
|
623 ni.sampleSupport = 1 ;
|
|
624 exons.push_back( ni ) ;
|
|
625 }
|
|
626
|
|
627
|
|
628 /*for ( i = 0 ; i < se.nextCnt ; ++i )
|
|
629 {
|
|
630 struct _interval ni ;
|
|
631 ni.chrId = se.chrId ;
|
|
632 ni.start = se.end ;
|
|
633 ni.end = se.next[i] ;
|
|
634 ni.strand = se.rightStrand ;
|
|
635 ni.sampleSupport = 1 ;
|
|
636 if ( ni.start + 1 < ni.end )
|
|
637 introns.push_back( ni ) ;
|
|
638 }*/
|
|
639
|
|
640 sp.chrId = se.chrId ;
|
|
641 sp.pos = se.start ;
|
|
642 sp.type = 1 ;
|
|
643 sp.splitType = se.leftType ;
|
|
644 sp.strand = se.leftStrand ;
|
|
645 sp.weight = 1 ;
|
|
646 subexonSplits.push_back( sp ) ;
|
|
647
|
|
648 sp.chrId = se.chrId ;
|
|
649 sp.pos = se.end ;
|
|
650 sp.type = 2 ;
|
|
651 sp.splitType = se.rightType ;
|
|
652 sp.strand = se.rightStrand ;
|
|
653 sp.weight = 1 ;
|
|
654 subexonSplits.push_back( sp ) ;
|
|
655
|
|
656 /*if ( se.prevCnt > 0 )
|
|
657 delete[] se.prev ;
|
|
658 if ( se.nextCnt > 0 )
|
|
659 delete[] se.next ;*/
|
|
660 }
|
|
661 CoalesceIntervals( exons ) ;
|
|
662 CoalesceIntervals( introns ) ;
|
|
663 CoalesceSubexonSplits( subexonSplits, origSize ) ;
|
|
664 CleanIntervalIrOverhang( intervalIrOverhang ) ;
|
|
665
|
|
666 fclose( fp ) ;
|
|
667 }
|
|
668
|
|
669 CoalesceDifferentStrandSubexonSplits( subexonSplits ) ;
|
|
670
|
|
671 // Obtain the split sites from the introns.
|
|
672 int intronCnt = introns.size() ;
|
|
673 std::vector<struct _subexonSplit> intronSplits ;
|
|
674 for ( i = 0 ; i < intronCnt ; ++i )
|
|
675 {
|
|
676 /*if ( introns[i].sampleSupport < 0.05 * fileCnt )
|
|
677 {
|
|
678 continue ;
|
|
679 }*/
|
|
680 struct _interval &it = introns[i] ;
|
|
681 struct _subexonSplit sp ;
|
|
682 sp.chrId = it.chrId ;
|
|
683 sp.pos = it.start ;
|
|
684 sp.type = 2 ;
|
|
685 sp.splitType = 2 ;
|
|
686 sp.strand = it.strand ;
|
|
687 intronSplits.push_back( sp ) ;
|
|
688
|
|
689 sp.chrId = it.chrId ;
|
|
690 sp.pos = it.end ;
|
|
691 sp.type = 1 ;
|
|
692 sp.splitType = 1 ;
|
|
693 sp.strand = it.strand ;
|
|
694 intronSplits.push_back( sp ) ;
|
|
695 }
|
|
696
|
|
697 // Pair up the split sites to get subexons
|
|
698 std::sort( intronSplits.begin(), intronSplits.end(), CompSubexonSplit ) ;
|
|
699 //std::sort( subexonSplits.begin(), subexonSplits.end(), CompSubexonSplit ) ;
|
|
700
|
|
701 // Convert the hard boundary to soft boundary if the split sites is filtered from the introns
|
|
702 // Seems NO need to do this now.
|
|
703 int splitCnt = subexonSplits.size() ;
|
|
704 int intronSplitCnt = intronSplits.size() ;
|
|
705 k = 0 ;
|
|
706 //for ( i = 0 ; i < splitCnt ; ++i )
|
|
707 while ( 0 )
|
|
708 {
|
|
709 if ( subexonSplits[i].type != subexonSplits[i].splitType )
|
|
710 continue ;
|
|
711
|
|
712 while ( k < intronSplitCnt && ( intronSplits[k].chrId < subexonSplits[i].chrId
|
|
713 || ( intronSplits[k].chrId == subexonSplits[i].chrId && intronSplits[k].pos < subexonSplits[i].pos ) ) )
|
|
714 ++k ;
|
|
715 j = k ;
|
|
716 while ( j < intronSplitCnt && intronSplits[j].chrId == subexonSplits[i].chrId
|
|
717 && intronSplits[j].pos == subexonSplits[i].pos && intronSplits[j].splitType != subexonSplits[i].splitType )
|
|
718 ++j ;
|
|
719
|
|
720 // the split site is filtered.
|
|
721 if ( j >= intronSplitCnt || intronSplits[j].chrId != subexonSplits[i].chrId ||
|
|
722 intronSplits[j].pos > subexonSplits[i].pos )
|
|
723 {
|
|
724 //printf( "%d %d. %d %d\n", subexonSplits[i].pos, intronSplits[j].pos, intronSplits[j].chrId , subexonSplits[i].chrId ) ;
|
|
725 subexonSplits[i].splitType = 0 ;
|
|
726
|
|
727 // Convert the adjacent subexon split.
|
|
728 for ( int l = i + 1 ; i < splitCnt && subexonSplits[l].chrId == subexonSplits[i].chrId
|
|
729 && subexonSplits[l].pos == subexonSplits[i].pos + 1 ; ++l )
|
|
730 {
|
|
731 if ( subexonSplits[l].type != subexonSplits[i].type
|
|
732 && subexonSplits[l].splitType == subexonSplits[i].splitType )
|
|
733 {
|
|
734 subexonSplits[l].splitType = 0 ;
|
|
735 }
|
|
736 }
|
|
737
|
|
738 // And the other direction
|
|
739 for ( int l = i - 1 ; l >= 0 && subexonSplits[l].chrId == subexonSplits[i].chrId
|
|
740 && subexonSplits[l].pos == subexonSplits[i].pos - 1 ; --l )
|
|
741 {
|
|
742 if ( subexonSplits[l].type != subexonSplits[i].type
|
|
743 && subexonSplits[l].splitType == subexonSplits[i].splitType )
|
|
744 {
|
|
745 subexonSplits[l].splitType = 0 ;
|
|
746 }
|
|
747 }
|
|
748 }
|
|
749 }
|
|
750 intronSplits.clear() ;
|
|
751 std::vector<struct _subexonSplit>().swap( intronSplits ) ;
|
|
752
|
|
753 // Force the soft boundary that collides with hard boundaries to be hard boundary.
|
|
754 for ( i = 0 ; i < splitCnt ; ++i )
|
|
755 {
|
|
756 if ( subexonSplits[i].splitType != 0 )
|
|
757 continue ;
|
|
758 int newSplitType = 0 ;
|
|
759 int newStrand = subexonSplits[i].strand ;
|
|
760 for ( j = i + 1 ; j < splitCnt ; ++j )
|
|
761 {
|
|
762 if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos ||
|
|
763 subexonSplits[i].chrId != subexonSplits[j].chrId )
|
|
764 break ;
|
|
765 if ( subexonSplits[j].splitType != 0 )
|
|
766 {
|
|
767 newSplitType = subexonSplits[j].splitType ;
|
|
768 newStrand = subexonSplits[j].strand ;
|
|
769 break ;
|
|
770 }
|
|
771 }
|
|
772
|
|
773 if ( newSplitType == 0 )
|
|
774 {
|
|
775 for ( j = i - 1 ; j >= 0 ; --j )
|
|
776 {
|
|
777 if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos ||
|
|
778 subexonSplits[i].chrId != subexonSplits[j].chrId )
|
|
779 break ;
|
|
780 if ( subexonSplits[j].splitType != 0 )
|
|
781 {
|
|
782 newSplitType = subexonSplits[j].splitType ;
|
|
783 newStrand = subexonSplits[j].strand ;
|
|
784 break ;
|
|
785 }
|
|
786 }
|
|
787
|
|
788 }
|
|
789 /*if ( subexonSplits[i].pos == 154464157 )
|
|
790 {
|
|
791 printf( "force conversion: %d %d %d. %d %d\n", subexonSplits[i].pos, subexonSplits[i].splitType, subexonSplits[i].weight, subexonSplits[i + 1].pos, subexonSplits[i + 1].splitType ) ;
|
|
792 }*/
|
|
793 subexonSplits[i].splitType = newSplitType ;
|
|
794 subexonSplits[i].strand = newStrand ;
|
|
795 }
|
|
796
|
|
797 /*for ( i = 0 ; i < splitCnt ; ++i )
|
|
798 {
|
|
799 printf( "%d: type=%d splitType=%d weight=%d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, subexonSplits[i].weight ) ;
|
|
800 }*/
|
|
801
|
|
802 // Build subexons from the collected split sites.
|
|
803
|
|
804 std::vector<struct _subexon> subexons ;
|
|
805 int diffCnt = 0 ; // |start of subexon split| - |end of subexon split|
|
|
806 int seCnt = 0 ;
|
|
807 for ( i = 0 ; i < splitCnt - 1 ; ++i )
|
|
808 {
|
|
809 struct _subexon se ;
|
|
810 /*if ( subexonSplits[i + 1].pos == 144177260 )
|
|
811 {
|
|
812 printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType,
|
|
813 subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ;
|
|
814 }*/
|
|
815
|
|
816 if ( subexonSplits[i].type == 1 )
|
|
817 diffCnt += subexonSplits[i].weight ;
|
|
818 else
|
|
819 diffCnt -= subexonSplits[i].weight ;
|
|
820
|
|
821 if ( subexonSplits[i + 1].chrId != subexonSplits[i].chrId )
|
|
822 {
|
|
823 diffCnt = 0 ;
|
|
824 continue ;
|
|
825 }
|
|
826
|
|
827 if ( diffCnt == 0 ) // the interval between subexon
|
|
828 continue ;
|
|
829
|
|
830 se.chrId = subexonSplits[i].chrId ;
|
|
831 se.start = subexonSplits[i].pos ;
|
|
832 se.leftType = subexonSplits[i].splitType ;
|
|
833 se.leftStrand = subexonSplits[i].strand ;
|
|
834 if ( subexonSplits[i].type == 2 )
|
|
835 {
|
|
836 se.leftStrand = 0 ;
|
|
837 ++se.start ;
|
|
838 }
|
|
839
|
|
840 se.end = subexonSplits[i + 1].pos ;
|
|
841 se.rightType = subexonSplits[i + 1].splitType ;
|
|
842 se.rightStrand = subexonSplits[i + 1].strand ;
|
|
843 if ( subexonSplits[i + 1].type == 1 )
|
|
844 {
|
|
845 se.rightStrand = 0 ;
|
|
846 --se.end ;
|
|
847 }
|
|
848
|
|
849 /*if ( se.end == 24613649 )
|
|
850 {
|
|
851 //printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType,
|
|
852 // subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ;
|
|
853 printf( "%d %d %d. %d %d %d\n", se.start, se.leftType, se.leftStrand, se.end, se.rightType, se.rightStrand ) ;
|
|
854 }*/
|
|
855
|
|
856 if ( se.start > se.end ) //Note: this handles the case of repeated subexon split.
|
|
857 {
|
|
858 // handle the case in sample 0: [...[..]
|
|
859 // in sample 1: [..]...]
|
|
860 if ( seCnt > 0 && se.end == subexons[seCnt - 1].end && subexons[seCnt - 1].rightType < se.rightType )
|
|
861 {
|
|
862 subexons[seCnt - 1].rightType = se.rightType ;
|
|
863 subexons[seCnt - 1].rightStrand = se.rightStrand ;
|
|
864 }
|
|
865 continue ;
|
|
866 }
|
|
867 se.leftClassifier = se.rightClassifier = 0 ;
|
|
868 se.lcCnt = se.rcCnt = 0 ;
|
|
869
|
|
870 /*if ( 1 ) //se.chrId == 25 )
|
|
871 {
|
|
872 printf( "%d: %d-%d: %d %d\n", se.chrId, se.start, se.end, se.leftType, se.rightType ) ;
|
|
873 }*/
|
|
874
|
|
875
|
|
876 se.next = se.prev = NULL ;
|
|
877 se.nextCnt = se.prevCnt = 0 ;
|
|
878 subexons.push_back( se ) ;
|
|
879 ++seCnt ;
|
|
880 }
|
|
881 subexonSplits.clear() ;
|
|
882 std::vector<struct _subexonSplit>().swap( subexonSplits ) ;
|
|
883
|
|
884 // Adjust the split type.
|
|
885 seCnt = subexons.size() ;
|
|
886 for ( i = 1 ; i < seCnt ; ++i )
|
|
887 {
|
|
888 if ( subexons[i - 1].end + 1 == subexons[i].start )
|
|
889 {
|
|
890 if ( subexons[i - 1].rightType == 0 )
|
|
891 subexons[i - 1].rightType = subexons[i].leftType ;
|
|
892 if ( subexons[i].leftType == 0 )
|
|
893 subexons[i].leftType = subexons[i - 1].rightType ;
|
|
894 }
|
|
895 }
|
|
896
|
|
897 // Merge the adjacent soft boundaries
|
|
898 std::vector<struct _subexon> rawSubexons = subexons ;
|
|
899 int exonCnt = exons.size() ;
|
|
900 subexons.clear() ;
|
|
901
|
|
902 k = 0 ; // hold index for exon.
|
|
903 for ( i = 0 ; i < seCnt ; )
|
|
904 {
|
|
905 /*if ( rawSubexons[k].rightType == 0 && rawSubexons[i].leftType == 0
|
|
906 && rawSubexons[k].end + 1 == rawSubexons[i].start )
|
|
907 {
|
|
908 rawSubexons[k].end = rawSubexons[i].end ;
|
|
909 rawSubexons[k].rightType = rawSubexons[i].rightType ;
|
|
910 rawSubexons[k].rightStrand = rawSubexons[i].rightStrand ;
|
|
911 }
|
|
912 else
|
|
913 {
|
|
914 subexons.push_back( rawSubexons[k] ) ;
|
|
915 k = i ;
|
|
916 }*/
|
|
917
|
|
918 while ( k < exonCnt && ( exons[k].chrId < rawSubexons[i].chrId
|
|
919 || ( exons[k].chrId == rawSubexons[i].chrId && exons[k].start < rawSubexons[i].start ) ) )
|
|
920 ++k ;
|
|
921
|
|
922 for ( j = i + 1 ; j < seCnt ; ++j )
|
|
923 {
|
|
924 if ( rawSubexons[j - 1].chrId != rawSubexons[j].chrId || rawSubexons[j - 1].rightType != 0 || rawSubexons[j].leftType != 0
|
|
925 || ( fileCnt > 1 && rawSubexons[j - 1].end + 1 != rawSubexons[j].start )
|
|
926 || ( fileCnt == 1 && rawSubexons[j - 1].end + 50 < rawSubexons[j].start ) )
|
|
927 break ;
|
|
928 }
|
|
929 // rawsubexons[i...j-1] will be merged.
|
|
930
|
|
931 /*if ( rawSubexons[i].start == 119625875 )
|
|
932 {
|
|
933 printf( "merge j-1: %d %d %d %d\n", rawSubexons[j - 1].end, rawSubexons[j - 1].rightType,
|
|
934 rawSubexons[j].start, rawSubexons[j].leftType ) ;
|
|
935 }*/
|
|
936 bool merge = true ;
|
|
937 if ( rawSubexons[i].leftType == 1 && rawSubexons[j - 1].rightType == 2 && j - i > 1
|
|
938 && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 )
|
|
939 {
|
|
940 merge = false ;
|
|
941 int sampleSupport = 0 ;
|
|
942 for ( int l = k ; l < exonCnt ; ++l )
|
|
943 {
|
|
944 if ( exons[l].chrId != rawSubexons[i].chrId || exons[l].start > rawSubexons[i].start )
|
|
945 break ;
|
|
946 if ( exons[l].end == rawSubexons[j - 1].end )
|
|
947 {
|
|
948 merge = true ;
|
|
949 sampleSupport = exons[l].sampleSupport ;
|
|
950 break ;
|
|
951 }
|
|
952 }
|
|
953
|
|
954 if ( merge == true && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 )
|
|
955 {
|
|
956 if ( sampleSupport <= 0.2 * fileCnt )
|
|
957 {
|
|
958 merge = false ;
|
|
959 }
|
|
960 }
|
|
961
|
|
962 if ( merge == false )
|
|
963 {
|
|
964 if ( j - i >= 3 )
|
|
965 {
|
|
966 rawSubexons[i].end = rawSubexons[ (i + j - 1) / 2 ].start ;
|
|
967 rawSubexons[j - 1].start = rawSubexons[ (i + j - 1) / 2].end ;
|
|
968 }
|
|
969
|
|
970 if ( rawSubexons[i].end + 1 == rawSubexons[j - 1].start )
|
|
971 {
|
|
972 --rawSubexons[i].end ;
|
|
973 ++rawSubexons[j - 1].start ;
|
|
974 }
|
|
975 subexons.push_back( rawSubexons[i] ) ;
|
|
976 subexons.push_back( rawSubexons[j - 1] ) ;
|
|
977 }
|
|
978 }
|
|
979
|
|
980 if ( merge )
|
|
981 {
|
|
982 rawSubexons[i].end = rawSubexons[j - 1].end ;
|
|
983 rawSubexons[i].rightType = rawSubexons[j - 1].rightType ;
|
|
984 rawSubexons[i].rightStrand = rawSubexons[j - 1].rightStrand ;
|
|
985
|
|
986 if ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType != 0 )
|
|
987 {
|
|
988 rawSubexons[i].start = rawSubexons[ ( i + j - 1 ) / 2 ].start ;
|
|
989 }
|
|
990 else if ( rawSubexons[i].rightType == 0 && rawSubexons[i].leftType != 0 )
|
|
991 {
|
|
992 rawSubexons[i].end = rawSubexons[ ( i + j - 1 ) / 2 ].end ;
|
|
993 }
|
|
994
|
|
995 subexons.push_back( rawSubexons[i] ) ;
|
|
996 }
|
|
997
|
|
998 i = j ;
|
|
999 }
|
|
1000 exons.clear() ;
|
|
1001 std::vector<struct _interval>().swap( exons ) ;
|
|
1002
|
|
1003 // Remove overhang, ir subexons intron created after putting multiple sample to gether.
|
|
1004 // eg: s0: [......)
|
|
1005 // s1: [...]--------[....]
|
|
1006 // s2: [...]..)-----[....]
|
|
1007 // Though the overhang from s2 is filtered in readin, there will a new overhang created combining s0,s1.
|
|
1008 // But be careful about how to compute the classifier for the overhang part contributed from s0.
|
|
1009 // Furthermore, note that the case of single-exon island showed up in intron retention region after combining is not possible when get here.
|
|
1010 // eg: s0:[...]-----[...]
|
|
1011 // s1: (.)
|
|
1012 // s2:[.............]
|
|
1013 // After merge adjacent soft boundaries, the single-exon island will disappear.
|
|
1014 rawSubexons = subexons ;
|
|
1015 seCnt = subexons.size() ;
|
|
1016 subexons.clear() ;
|
|
1017 for ( i = 0 ; i < seCnt ; ++i )
|
|
1018 {
|
|
1019 if ( ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 1 ) // ir
|
|
1020 || ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 0 ) // overhang
|
|
1021 || ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType == 1 ) )
|
|
1022 continue ;
|
|
1023 subexons.push_back( rawSubexons[i] ) ;
|
|
1024 }
|
|
1025
|
|
1026 // Remove the single-exon island if it overlaps with intron retentioned or overhang.
|
|
1027 rawSubexons = subexons ;
|
|
1028 seCnt = subexons.size() ;
|
|
1029 subexons.clear() ;
|
|
1030 k = 0 ;
|
|
1031 std::sort( intervalIrOverhang.begin(), intervalIrOverhang.end(), CompInterval ) ;
|
|
1032 int irOverhangCnt = intervalIrOverhang.size() ;
|
|
1033
|
|
1034 for ( i = 0 ; i < seCnt ; ++i )
|
|
1035 {
|
|
1036 if ( rawSubexons[i].leftType != 0 || rawSubexons[i].rightType != 0 )
|
|
1037 {
|
|
1038 subexons.push_back( rawSubexons[i] ) ;
|
|
1039 continue ;
|
|
1040 }
|
|
1041
|
|
1042 while ( k < irOverhangCnt )
|
|
1043 {
|
|
1044 // Locate the interval that before the island
|
|
1045 if ( intervalIrOverhang[k].chrId < rawSubexons[i].chrId
|
|
1046 || ( intervalIrOverhang[k].chrId == rawSubexons[i].chrId && intervalIrOverhang[k].end < rawSubexons[i].start ) )
|
|
1047 {
|
|
1048 ++k ;
|
|
1049 continue ;
|
|
1050 }
|
|
1051 break ;
|
|
1052 }
|
|
1053 bool overlap = false ;
|
|
1054 for ( j = k ; j < irOverhangCnt ; ++j )
|
|
1055 {
|
|
1056 if ( intervalIrOverhang[j].chrId > rawSubexons[i].chrId || intervalIrOverhang[j].start > rawSubexons[i].end )
|
|
1057 break ;
|
|
1058 if ( ( intervalIrOverhang[j].start <= rawSubexons[i].start && intervalIrOverhang[j].end >= rawSubexons[i].start )
|
|
1059 || ( intervalIrOverhang[j].start <= rawSubexons[i].end && intervalIrOverhang[j].end >= rawSubexons[i].end ) )
|
|
1060 {
|
|
1061 overlap = true ;
|
|
1062 break ;
|
|
1063 }
|
|
1064 }
|
|
1065
|
|
1066 if ( !overlap )
|
|
1067 subexons.push_back( rawSubexons[i] ) ;
|
|
1068 }
|
|
1069 rawSubexons.clear() ;
|
|
1070 std::vector<struct _subexon>().swap( rawSubexons ) ;
|
|
1071
|
|
1072 intervalIrOverhang.clear() ;
|
|
1073 std::vector<struct _interval>().swap( intervalIrOverhang ) ;
|
|
1074
|
|
1075 // Create the dummy intervals.
|
|
1076 seCnt = subexons.size() ;
|
|
1077 std::vector<struct _intronicInfo> intronicInfos ;
|
|
1078 std::vector<struct _seInterval> seIntervals ;
|
|
1079 std::vector<struct _subexonSupplement> subexonInfo ;
|
|
1080
|
|
1081 //subexonInfo.resize( seCnt ) ;
|
|
1082 for ( i = 0 ; i < seCnt ; ++i )
|
|
1083 {
|
|
1084 struct _seInterval ni ; // new interval
|
|
1085 ni.start = subexons[i].start ;
|
|
1086 ni.end = subexons[i].end ;
|
|
1087 ni.type = 0 ;
|
|
1088 ni.idx = i ;
|
|
1089 ni.chrId = subexons[i].chrId ;
|
|
1090 seIntervals.push_back( ni ) ;
|
|
1091
|
|
1092 /*if ( subexons[i].end == 42671717 )
|
|
1093 {
|
|
1094 printf( "%d: %d-%d: %d\n", subexons[i].chrId, subexons[i].start, subexons[i].end, subexons[i].rightType ) ;
|
|
1095 }*/
|
|
1096 //subexonInfo[i].prevSupport = subexonInfo[i].nextSupport = NULL ;
|
|
1097
|
|
1098 /*int nexti ;
|
|
1099 for ( nexti = i + 1 ; nexti < seCnt ; ++nexti )
|
|
1100 if ( subexons[ nexti ].leftType == 0 && subexons[nexti].rightType == 0 )*/
|
|
1101
|
|
1102 if ( i < seCnt - 1 && subexons[i].chrId == subexons[i + 1].chrId &&
|
|
1103 subexons[i].end + 1 < subexons[i + 1].start &&
|
|
1104 subexons[i].rightType + subexons[i + 1].leftType != 0 )
|
|
1105 {
|
|
1106 // Only consider the intervals like ]..[,]...(, )...[
|
|
1107 // The case like ]...] is actaully things like ][...] in subexon perspective,
|
|
1108 // so they won't pass the if-statement
|
|
1109 struct _intronicInfo nii ; // new intronic info
|
|
1110 ni.start = subexons[i].end + 1 ;
|
|
1111 ni.end = subexons[i + 1].start - 1 ;
|
|
1112 ni.type = 1 ;
|
|
1113 ni.idx = intronicInfos.size() ;
|
|
1114 seIntervals.push_back( ni ) ;
|
|
1115
|
|
1116 nii.chrId = subexons[i].chrId ;
|
|
1117 nii.start = ni.start ;
|
|
1118 nii.end = ni.end ;
|
|
1119 nii.leftSubexonIdx = i ;
|
|
1120 nii.rightSubexonIdx = i + 1 ;
|
|
1121 nii.irClassifier = 0 ;
|
|
1122 nii.irCnt = 0 ;
|
|
1123 nii.validIrCnt = 0 ;
|
|
1124 nii.leftOverhang.cnt = 0 ;
|
|
1125 nii.leftOverhang.validCnt = 0 ;
|
|
1126 nii.leftOverhang.length = 0 ;
|
|
1127 nii.leftOverhang.classifier = 0 ;
|
|
1128 nii.rightOverhang.cnt = 0 ;
|
|
1129 nii.rightOverhang.validCnt = 0 ;
|
|
1130 nii.rightOverhang.length = 0 ;
|
|
1131 nii.rightOverhang.classifier = 0 ;
|
|
1132 intronicInfos.push_back( nii ) ;
|
|
1133 /*if ( nii.end == 23667 )
|
|
1134 {
|
|
1135 printf( "%d %d. %d (%d %d %d)\n", nii.start, nii.end, subexons[i].rightType, subexons[i+1].start, subexons[i + 1].end, subexons[i + 1].leftType ) ;
|
|
1136 }*/
|
|
1137 }
|
|
1138 }
|
|
1139
|
|
1140 // Go through all the files to get some statistics number
|
|
1141 double avgIrPiRatio = 0 ;
|
|
1142 double avgIrPiCov = 0 ;
|
|
1143 double irPiRatio, irKRatio[2], irThetaRatio[2] ; // Some statistical results
|
|
1144 double irPiCov, irKCov[2], irThetaCov[2] ;
|
|
1145
|
|
1146 double avgOverhangPiRatio = 0 ;
|
|
1147 double avgOverhangPiCov = 0 ;
|
|
1148 double overhangPiRatio, overhangKRatio[2], overhangThetaRatio[2] ; // Some statistical results
|
|
1149 double overhangPiCov, overhangKCov[2], overhangThetaCov[2] ;
|
|
1150
|
|
1151 for ( k = 0 ; k < fileCnt ; ++k )
|
|
1152 {
|
|
1153 fp = fopen( files[k], "r" ) ;
|
|
1154
|
|
1155 while ( fgets( buffer, sizeof( buffer), fp ) != NULL )
|
|
1156 {
|
|
1157 if ( buffer[0] == '#' )
|
|
1158 {
|
|
1159 char buffer2[100] ;
|
|
1160 sscanf( buffer, "%s", buffer2 ) ;
|
|
1161 if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) )
|
|
1162 {
|
|
1163 // TODO: ignore certain samples if the coverage seems wrong.
|
|
1164 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
|
|
1165 buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0],
|
|
1166 buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ;
|
|
1167 avgIrPiRatio += irPiRatio ;
|
|
1168 }
|
|
1169 else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) )
|
|
1170 {
|
|
1171 }
|
|
1172 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) )
|
|
1173 {
|
|
1174 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
|
|
1175 buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0],
|
|
1176 buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ;
|
|
1177 avgOverhangPiRatio += overhangPiRatio ;
|
|
1178 }
|
|
1179 }
|
|
1180 else
|
|
1181 break ;
|
|
1182 }
|
|
1183 fclose( fp ) ;
|
|
1184 }
|
|
1185 avgIrPiRatio /= fileCnt ;
|
|
1186 avgOverhangPiRatio /= fileCnt ;
|
|
1187
|
|
1188 // Go through all the files to put statistical results into each subexon.
|
|
1189 std::vector< struct _subexon > sampleSubexons ;
|
|
1190 int subexonCnt = subexons.size() ;
|
|
1191 for ( k = 0 ; k < fileCnt ; ++k )
|
|
1192 {
|
|
1193 //if ( k == 220 )
|
|
1194 // exit( 1 ) ;
|
|
1195 fp = fopen( files[k], "r" ) ;
|
|
1196 struct _subexon se ;
|
|
1197 struct _subexonSplit sp ;
|
|
1198 char chrName[50] ;
|
|
1199
|
|
1200 sampleSubexons.clear() ;
|
|
1201
|
|
1202 int tag = 0 ;
|
|
1203 while ( fgets( buffer, sizeof( buffer), fp ) != NULL )
|
|
1204 {
|
|
1205 if ( buffer[0] == '#' )
|
|
1206 {
|
|
1207 char buffer2[200] ;
|
|
1208 sscanf( buffer, "%s", buffer2 ) ;
|
|
1209 if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) )
|
|
1210 {
|
|
1211 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
|
|
1212 buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0],
|
|
1213 buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ;
|
|
1214 }
|
|
1215 else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) )
|
|
1216 {
|
|
1217 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
|
|
1218 buffer2, buffer2, &irPiCov, buffer2, &irKCov[0], buffer2, &irThetaCov[0],
|
|
1219 buffer2, &irKCov[1], buffer2, &irThetaCov[1] ) ;
|
|
1220 }
|
|
1221 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) )
|
|
1222 {
|
|
1223 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
|
|
1224 buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0],
|
|
1225 buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ;
|
|
1226 }
|
|
1227 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_cov:" ) )
|
|
1228 {
|
|
1229 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
|
|
1230 buffer2, buffer2, &overhangPiCov, buffer2, &overhangKCov[0], buffer2, &overhangThetaCov[0],
|
|
1231 buffer2, &overhangKCov[1], buffer2, &overhangThetaCov[1] ) ;
|
|
1232 }
|
|
1233 continue ;
|
|
1234 }
|
|
1235 else
|
|
1236 break ;
|
|
1237
|
|
1238 //SubexonGraph::InputSubexon( buffer, alignments, se, true ) ;
|
|
1239 //sampleSubexons.push_back( se ) ;
|
|
1240 }
|
|
1241
|
|
1242 //int sampleSubexonCnt = sampleSubexons.size() ;
|
|
1243 int intervalCnt = seIntervals.size() ;
|
|
1244 //for ( i = 0 ; i < sampleSubexonCnt ; ++i )
|
|
1245 int iterCnt = 0 ;
|
|
1246 while ( 1 )
|
|
1247 {
|
|
1248 if ( iterCnt > 0 && fgets( buffer, sizeof( buffer), fp ) == NULL)
|
|
1249 break ;
|
|
1250 ++iterCnt ;
|
|
1251
|
|
1252 struct _subexon se ;
|
|
1253 SubexonGraph::InputSubexon( buffer, alignments, se, true ) ;
|
|
1254
|
|
1255 while ( tag < intervalCnt )
|
|
1256 {
|
|
1257 if ( seIntervals[tag].chrId < se.chrId ||
|
|
1258 ( seIntervals[tag].chrId == se.chrId && seIntervals[tag].end < se.start ) )
|
|
1259 {
|
|
1260 ++tag ;
|
|
1261 continue ;
|
|
1262 }
|
|
1263 else
|
|
1264 break ;
|
|
1265 }
|
|
1266
|
|
1267 for ( j = tag ; j < intervalCnt ; ++j )
|
|
1268 {
|
|
1269 if ( seIntervals[j].start > se.end || seIntervals[j].chrId > se.chrId ) // terminate if no overlap.
|
|
1270 break ;
|
|
1271 int idx ;
|
|
1272
|
|
1273 if ( seIntervals[j].type == 0 )
|
|
1274 {
|
|
1275 idx = seIntervals[j].idx ;
|
|
1276 if ( subexons[idx].leftType == 1 && se.leftType == 1 && subexons[idx].start == se.start )
|
|
1277 {
|
|
1278 double tmp = se.leftClassifier ;
|
|
1279 if ( se.leftClassifier == 0 )
|
|
1280 tmp = 1e-7 ;
|
|
1281 subexons[idx].leftClassifier -= 2.0 * log( tmp ) ;
|
|
1282 ++subexons[idx].lcCnt ;
|
|
1283 subexons[idx].prev = MergePositions( subexons[idx].prev, subexons[idx].prevCnt,
|
|
1284 se.prev, se.prevCnt, subexons[idx].prevCnt ) ;
|
|
1285
|
|
1286 if ( se.rightType == 0 ) // a gene end here
|
|
1287 {
|
|
1288 for ( int l = idx ; l < subexonCnt ; ++l )
|
|
1289 {
|
|
1290 if ( l > idx && ( subexons[l].end > subexons[l - 1].start + 1
|
|
1291 || subexons[l].chrId != subexons[l - 1].chrId ) )
|
|
1292 break ;
|
|
1293 if ( subexons[l].rightType == 2 )
|
|
1294 {
|
|
1295 double adjustAvgDepth = se.avgDepth ;
|
|
1296 if ( se.end - se.start + 1 >= 100 )
|
|
1297 adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ;
|
|
1298 else
|
|
1299 adjustAvgDepth *= 2 ;
|
|
1300 double p = GetPValueOfGeneEnd( adjustAvgDepth ) ;
|
|
1301 //if ( se.end - se.start + 1 >= 500 && p > 0.001 )
|
|
1302 // p = 0.001 ;
|
|
1303
|
|
1304 subexons[l].rightClassifier -= 2.0 * log( p ) ;
|
|
1305 ++subexons[l].rcCnt ;
|
|
1306 break ;
|
|
1307 }
|
|
1308 }
|
|
1309 }
|
|
1310 }
|
|
1311 //if ( se.chrId == 25 )
|
|
1312 // printf( "(%d %d %d. %d) (%d %d %d. %d)\n", se.chrId, se.start, se.end, se.rightType, subexons[idx].chrId, subexons[idx].start, subexons[idx].end, subexons[idx].rightType ) ;
|
|
1313 if ( subexons[idx].rightType == 2 && se.rightType == 2 && subexons[idx].end == se.end )
|
|
1314 {
|
|
1315 double tmp = se.rightClassifier ;
|
|
1316 if ( se.rightClassifier == 0 )
|
|
1317 tmp = 1e-7 ;
|
|
1318 subexons[idx].rightClassifier -= 2.0 * log( tmp ) ;
|
|
1319 ++subexons[idx].rcCnt ;
|
|
1320
|
|
1321 subexons[idx].next = MergePositions( subexons[idx].next, subexons[idx].nextCnt,
|
|
1322 se.next, se.nextCnt, subexons[idx].nextCnt ) ;
|
|
1323
|
|
1324 if ( se.leftType == 0 )
|
|
1325 {
|
|
1326 for ( int l = idx ; l >= 0 ; --l )
|
|
1327 {
|
|
1328 if ( l < idx && ( subexons[l].end < subexons[l + 1].start - 1
|
|
1329 || subexons[l].chrId != subexons[l + 1].chrId ) )
|
|
1330 break ;
|
|
1331 if ( subexons[l].leftType == 1 )
|
|
1332 {
|
|
1333 double adjustAvgDepth = se.avgDepth ;
|
|
1334 if ( se.end - se.start + 1 >= 100 )
|
|
1335 adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ;
|
|
1336 else
|
|
1337 adjustAvgDepth *= 2 ;
|
|
1338 double p = GetPValueOfGeneEnd( adjustAvgDepth ) ;
|
|
1339 //if ( se.end - se.start + 1 >= 500 && p >= 0.001 )
|
|
1340 // p = 0.001 ;
|
|
1341 subexons[l].leftClassifier -= 2.0 * log( p ) ;
|
|
1342 ++subexons[l].lcCnt ;
|
|
1343 break ;
|
|
1344 }
|
|
1345 }
|
|
1346 }
|
|
1347 }
|
|
1348
|
|
1349 if ( subexons[idx].leftType == 0 && subexons[idx].rightType == 0
|
|
1350 && se.leftType == 0 && se.rightType == 0 ) // the single-exon island.
|
|
1351 {
|
|
1352 double tmp = se.leftClassifier ;
|
|
1353 if ( se.leftClassifier == 0 )
|
|
1354 tmp = 1e-7 ;
|
|
1355 subexons[idx].leftClassifier -= 2.0 * log( tmp ) ;
|
|
1356 subexons[idx].rightClassifier = subexons[idx].leftClassifier ;
|
|
1357 ++subexons[idx].lcCnt ;
|
|
1358 ++subexons[idx].rcCnt ;
|
|
1359 }
|
|
1360 }
|
|
1361 else if ( seIntervals[j].type == 1 )
|
|
1362 {
|
|
1363 idx = seIntervals[j].idx ;
|
|
1364 // Overlap on the left part of intron
|
|
1365 if ( se.start <= intronicInfos[idx].start && se.end < intronicInfos[idx].end
|
|
1366 && subexons[ intronicInfos[idx].leftSubexonIdx ].rightType != 0 )
|
|
1367 {
|
|
1368 int len = se.end - intronicInfos[idx].start + 1 ;
|
|
1369 intronicInfos[idx].leftOverhang.length += len ;
|
|
1370 ++intronicInfos[idx].leftOverhang.cnt ;
|
|
1371
|
|
1372 // Note that the sample subexon must have a soft boundary at right hand side,
|
|
1373 // otherwise, this part is not an intron and won't show up in intronic Info.
|
|
1374 if ( se.leftType == 2 )
|
|
1375 {
|
|
1376 if ( se.leftRatio > 0 && se.avgDepth > 1 )
|
|
1377 {
|
|
1378 ++intronicInfos[idx].leftOverhang.validCnt ;
|
|
1379
|
|
1380 double update = GetUpdateMixtureGammaClassifier( se.leftRatio, se.avgDepth,
|
|
1381 overhangPiRatio, overhangKRatio, overhangThetaRatio,
|
|
1382 overhangPiCov, overhangKCov, overhangThetaCov, false ) ;
|
|
1383 intronicInfos[idx].leftOverhang.classifier += update ;
|
|
1384 }
|
|
1385 }
|
|
1386 else if ( se.leftType == 1 )
|
|
1387 {
|
|
1388 ++intronicInfos[idx].leftOverhang.validCnt ;
|
|
1389 double update = GetUpdateMixtureGammaClassifier( 1.0, se.avgDepth,
|
|
1390 overhangPiRatio, overhangKRatio, overhangThetaRatio,
|
|
1391 overhangPiCov, overhangKCov, overhangThetaCov, true ) ;
|
|
1392 intronicInfos[idx].leftOverhang.classifier += update ;
|
|
1393
|
|
1394 int seIdx = intronicInfos[idx].leftSubexonIdx ;
|
|
1395 subexons[seIdx].rightClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ;
|
|
1396 ++subexons[ seIdx ].rcCnt ;
|
|
1397 }
|
|
1398 // ignore the contribution of single-exon island here?
|
|
1399 }
|
|
1400 // Overlap on the right part of intron
|
|
1401 else if ( se.start > intronicInfos[idx].start && se.end >= intronicInfos[idx].end
|
|
1402 && subexons[ intronicInfos[idx].rightSubexonIdx ].leftType != 0 )
|
|
1403 {
|
|
1404 int len = intronicInfos[idx].end - se.start + 1 ;
|
|
1405 intronicInfos[idx].rightOverhang.length += len ;
|
|
1406 ++intronicInfos[idx].rightOverhang.cnt ;
|
|
1407
|
|
1408 // Note that the sample subexon must have a soft boundary at left hand side,
|
|
1409 // otherwise, this won't show up in intronic Info
|
|
1410 if ( se.rightType == 1 )
|
|
1411 {
|
|
1412 if ( se.rightRatio > 0 && se.avgDepth > 1 )
|
|
1413 {
|
|
1414 ++intronicInfos[idx].rightOverhang.validCnt ;
|
|
1415
|
|
1416 double update = GetUpdateMixtureGammaClassifier( se.rightRatio, se.avgDepth,
|
|
1417 overhangPiRatio, overhangKRatio, overhangThetaRatio,
|
|
1418 overhangPiCov, overhangKCov, overhangThetaCov, false ) ;
|
|
1419 intronicInfos[idx].rightOverhang.classifier += update ;
|
|
1420 }
|
|
1421 }
|
|
1422 else if ( se.rightType == 2 )
|
|
1423 {
|
|
1424 ++intronicInfos[idx].rightOverhang.validCnt ;
|
|
1425
|
|
1426 double update = GetUpdateMixtureGammaClassifier( 1, se.avgDepth,
|
|
1427 overhangPiRatio, overhangKRatio, overhangThetaRatio,
|
|
1428 overhangPiCov, overhangKCov, overhangThetaCov, true ) ;
|
|
1429 intronicInfos[idx].rightOverhang.classifier += update ;
|
|
1430
|
|
1431 int seIdx = intronicInfos[idx].rightSubexonIdx ;
|
|
1432 /*if ( subexons[ seIdx ].start == 6873648 )
|
|
1433 {
|
|
1434 printf( "%lf %lf: %lf %lf %lf\n", subexons[seIdx].leftClassifier, GetPValueOfGeneEnd( se.avgDepth ), se.avgDepth, sqrt( se.avgDepth ), log( se.avgDepth ) ) ;
|
|
1435 }*/
|
|
1436 subexons[seIdx].leftClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ;
|
|
1437 ++subexons[ seIdx ].lcCnt ;
|
|
1438 }
|
|
1439 }
|
|
1440 // Intron is fully contained in this sample subexon, then it is a ir candidate
|
|
1441 else if ( se.start <= intronicInfos[idx].start && se.end >= intronicInfos[idx].end )
|
|
1442 {
|
|
1443 if ( se.leftType == 2 && se.rightType == 1 )
|
|
1444 {
|
|
1445 double ratio = regions.PickLeftAndRightRatio( se.leftRatio, se.rightRatio ) ;
|
|
1446 ++intronicInfos[idx].irCnt ;
|
|
1447 if ( ratio > 0 && se.avgDepth > 1 )
|
|
1448 {
|
|
1449 double update = GetUpdateMixtureGammaClassifier( ratio, se.avgDepth,
|
|
1450 irPiRatio, irKRatio, irThetaRatio,
|
|
1451 irPiCov, irKCov, irThetaCov, true ) ;
|
|
1452 //if ( intronicInfos[idx].start == 37617368 )
|
|
1453 // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ;
|
|
1454 intronicInfos[idx].irClassifier += update ;
|
|
1455 ++intronicInfos[idx].validIrCnt ;
|
|
1456 }
|
|
1457 }
|
|
1458 else if ( se.leftType == 1 || se.rightType == 2 )
|
|
1459 {
|
|
1460 //intronicInfos[idx].irClassifier += LogGammaDensity( 4.0, irKRatio[1], irThetaRatio[1] )
|
|
1461 // - LogGammaDensity( 4.0, irKRatio[0], irThetaRatio[0] ) ;
|
|
1462 /*if ( se.start == 37617368 )
|
|
1463 {
|
|
1464 printf( "%lf: %lf %lf\n", se.avgDepth, MixtureGammaAssignment( ( irKCov[0] - 1 ) * irThetaCov[0], irPiRatio, irKCov, irThetaCov ),
|
|
1465 MixtureGammaAssignment( TransformCov( 4.0 ), irPiRatio, irKCov, irThetaCov ) ) ;
|
|
1466 }*/
|
|
1467 if ( se.avgDepth > 1 )
|
|
1468 {
|
|
1469 // let the depth be the threshold to determine.
|
|
1470 double update = GetUpdateMixtureGammaClassifier( 4.0, se.avgDepth,
|
|
1471 irPiRatio, irKRatio, irThetaRatio,
|
|
1472 irPiCov, irKCov, irThetaCov, true ) ;
|
|
1473 //if ( intronicInfos[idx].start == 36266630 )
|
|
1474 // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ;
|
|
1475 intronicInfos[idx].irClassifier += update ;
|
|
1476 ++intronicInfos[idx].irCnt ;
|
|
1477 ++intronicInfos[idx].validIrCnt ;
|
|
1478 }
|
|
1479 }
|
|
1480 else
|
|
1481 {
|
|
1482 // the intron is contained in a overhang subexon from the sample or single-exon island
|
|
1483 }
|
|
1484 }
|
|
1485 // sample subexon is contained in the intron.
|
|
1486 else
|
|
1487 {
|
|
1488 // Do nothing.
|
|
1489 }
|
|
1490 }
|
|
1491 }
|
|
1492
|
|
1493 //if ( se.nextCnt > 0 )
|
|
1494 delete[] se.next ;
|
|
1495 //if ( se.prevCnt > 0 )
|
|
1496 delete[] se.prev ;
|
|
1497 }
|
|
1498 fclose( fp ) ;
|
|
1499
|
|
1500 /*for ( i = 0 ; i < sampleSubexonCnt ; ++i )
|
|
1501 {
|
|
1502 if ( sampleSubexons[i].nextCnt > 0 )
|
|
1503 delete[] sampleSubexons[i].next ;
|
|
1504 if ( sampleSubexons[i].prevCnt > 0 )
|
|
1505 delete[] sampleSubexons[i].prev ;
|
|
1506 }*/
|
|
1507 }
|
|
1508
|
|
1509 CleanUpSubexonConnections( subexons ) ;
|
|
1510
|
|
1511 // Convert the temporary statistics number into formal statistics result.
|
|
1512 for ( i = 0 ; i < subexonCnt ; ++i )
|
|
1513 {
|
|
1514 struct _subexon &se = subexons[i] ;
|
|
1515 if ( se.leftType == 0 && se.rightType == 0 ) // single-exon txpt.
|
|
1516 {
|
|
1517 se.leftClassifier = se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ;
|
|
1518 }
|
|
1519 else
|
|
1520 {
|
|
1521 if ( se.leftType == 1 )
|
|
1522 {
|
|
1523 se.leftClassifier = 1 - chicdf( se.leftClassifier, 2 * se.lcCnt ) ;
|
|
1524 }
|
|
1525 else
|
|
1526 se.leftClassifier = -1 ;
|
|
1527
|
|
1528 if ( se.rightType == 2 )
|
|
1529 se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ;
|
|
1530 else
|
|
1531 se.rightClassifier = -1 ;
|
|
1532 }
|
|
1533 }
|
|
1534
|
|
1535 int iiCnt = intronicInfos.size() ; //intronicInfo count
|
|
1536 for ( i = 0 ; i < iiCnt ; ++i )
|
|
1537 {
|
|
1538 struct _intronicInfo &ii = intronicInfos[i] ;
|
|
1539 if ( ii.validIrCnt > 0 )
|
|
1540 {
|
|
1541 for ( j = 0 ; j < fileCnt - ii.validIrCnt ; ++j )
|
|
1542 {
|
|
1543 ii.irClassifier -= log( 10.0 ) ;
|
|
1544 }
|
|
1545 /*if ( ii.validIrCnt < fileCnt * 0.15 )
|
|
1546 ii.irClassifier -= log( 1000.0 ) ;
|
|
1547 else if ( ii.validIrCnt < fileCnt * 0.5 )
|
|
1548 ii.irClassifier -= log( 100.0 ) ;*/
|
|
1549 ii.irClassifier = (double)1.0 / ( 1.0 + exp( ii.irClassifier + log( 1 - avgIrPiRatio ) - log( avgIrPiRatio ) ) ) ;
|
|
1550 }
|
|
1551 else
|
|
1552 ii.irClassifier = -1 ;
|
|
1553
|
|
1554 if ( ii.leftOverhang.validCnt > 0 )
|
|
1555 ii.leftOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.leftOverhang.classifier +
|
|
1556 log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ;
|
|
1557 else
|
|
1558 ii.leftOverhang.classifier = -1 ;
|
|
1559
|
|
1560 if ( ii.rightOverhang.validCnt > 0 )
|
|
1561 ii.rightOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.rightOverhang.classifier +
|
|
1562 log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ;
|
|
1563 else
|
|
1564 ii.rightOverhang.classifier = -1 ;
|
|
1565 }
|
|
1566
|
|
1567 // Change the classifier for the hard boundaries if its adjacent intron has intron retention classifier
|
|
1568 // which collide with overhang subexon.
|
|
1569 int intervalCnt = seIntervals.size() ;
|
|
1570 for ( i = 0 ; i < intervalCnt ; ++i )
|
|
1571 {
|
|
1572 if ( seIntervals[i].type == 1 && intronicInfos[ seIntervals[i].idx ].irCnt > 0 )
|
|
1573 {
|
|
1574 int idx = seIntervals[i].idx ;
|
|
1575 if ( intronicInfos[idx].leftOverhang.cnt > 0 )
|
|
1576 {
|
|
1577 int k = seIntervals[i - 1].idx ;
|
|
1578 // Should aim for more conservative?
|
|
1579 if ( subexons[k].rightClassifier > intronicInfos[idx].leftOverhang.classifier )
|
|
1580 subexons[k].rightClassifier = intronicInfos[idx].leftOverhang.classifier ;
|
|
1581 }
|
|
1582
|
|
1583 if ( intronicInfos[idx].rightOverhang.cnt > 0 )
|
|
1584 {
|
|
1585 int k = seIntervals[i + 1].idx ;
|
|
1586 if ( subexons[k].leftClassifier > intronicInfos[idx].rightOverhang.classifier )
|
|
1587 subexons[k].leftClassifier = intronicInfos[idx].rightOverhang.classifier ;
|
|
1588 }
|
|
1589 }
|
|
1590 }
|
|
1591
|
|
1592 // Output the result.
|
|
1593 for ( i = 0 ; i < intervalCnt ; ++i )
|
|
1594 {
|
|
1595 if ( seIntervals[i].type == 0 )
|
|
1596 {
|
|
1597 struct _subexon &se = subexons[ seIntervals[i].idx ] ;
|
|
1598
|
|
1599 char ls, rs ;
|
|
1600
|
|
1601 ls = StrandNumToSymbol( se.leftStrand ) ;
|
|
1602 rs = StrandNumToSymbol( se.rightStrand ) ;
|
|
1603
|
|
1604 printf( "%s %d %d %d %d %c %c -1 -1 -1 %lf %lf ", alignments.GetChromName( se.chrId ), se.start, se.end,
|
|
1605 se.leftType, se.rightType, ls, rs, se.leftClassifier, se.rightClassifier ) ;
|
|
1606 if ( i > 0 && seIntervals[i - 1].chrId == seIntervals[i].chrId
|
|
1607 && seIntervals[i - 1].end + 1 == seIntervals[i].start
|
|
1608 && !( seIntervals[i - 1].type == 0 &&
|
|
1609 subexons[ seIntervals[i - 1].idx ].rightType != se.leftType )
|
|
1610 && !( seIntervals[i - 1].type == 1 && intronicInfos[ seIntervals[i - 1].idx ].irCnt == 0
|
|
1611 && intronicInfos[ seIntervals[i - 1].idx ].rightOverhang.cnt == 0 )
|
|
1612 && ( se.prevCnt == 0 || se.start - 1 != se.prev[ se.prevCnt - 1 ] ) ) // The connection showed up in the subexon file.
|
|
1613 {
|
|
1614 printf( "%d ", se.prevCnt + 1 ) ;
|
|
1615 for ( j = 0 ; j < se.prevCnt ; ++j )
|
|
1616 printf( "%d ", se.prev[j] ) ;
|
|
1617 printf( "%d ", se.start - 1 ) ;
|
|
1618 }
|
|
1619 else
|
|
1620 {
|
|
1621 printf( "%d ", se.prevCnt ) ;
|
|
1622 for ( j = 0 ; j < se.prevCnt ; ++j )
|
|
1623 printf( "%d ", se.prev[j] ) ;
|
|
1624 }
|
|
1625
|
|
1626 if ( i < intervalCnt - 1 && seIntervals[i].chrId == seIntervals[i + 1].chrId
|
|
1627 && seIntervals[i].end == seIntervals[i + 1].start - 1
|
|
1628 && !( seIntervals[i + 1].type == 0 &&
|
|
1629 subexons[ seIntervals[i + 1].idx ].leftType != se.rightType )
|
|
1630 && !( seIntervals[i + 1].type == 1 && intronicInfos[ seIntervals[i + 1].idx ].irCnt == 0
|
|
1631 && intronicInfos[ seIntervals[i + 1].idx ].leftOverhang.cnt == 0 )
|
|
1632 && ( se.nextCnt == 0 || se.end + 1 != se.next[0] ) )
|
|
1633 {
|
|
1634 printf( "%d %d ", se.nextCnt + 1, se.end + 1 ) ;
|
|
1635 }
|
|
1636 else
|
|
1637 printf( "%d ", se.nextCnt ) ;
|
|
1638 for ( j = 0 ; j < se.nextCnt ; ++j )
|
|
1639 printf( "%d ", se.next[j] ) ;
|
|
1640 printf( "\n" ) ;
|
|
1641 }
|
|
1642 else if ( seIntervals[i].type == 1 )
|
|
1643 {
|
|
1644 struct _intronicInfo &ii = intronicInfos[ seIntervals[i].idx ] ;
|
|
1645 if ( ii.irCnt > 0 )
|
|
1646 {
|
|
1647 printf( "%s %d %d 2 1 . . -1 -1 -1 %lf %lf 1 %d 1 %d\n",
|
|
1648 alignments.GetChromName( ii.chrId ), ii.start, ii.end,
|
|
1649 ii.irClassifier, ii.irClassifier,
|
|
1650 seIntervals[i - 1].end, seIntervals[i + 1].start ) ;
|
|
1651 }
|
|
1652 else
|
|
1653 {
|
|
1654 // left overhang.
|
|
1655 if ( ii.leftOverhang.cnt > 0 )
|
|
1656 {
|
|
1657 printf( "%s %d %d 2 0 . . -1 -1 -1 %lf %lf 1 %d 0\n",
|
|
1658 alignments.GetChromName( ii.chrId ), ii.start,
|
|
1659 ii.start + ( ii.leftOverhang.length / ii.leftOverhang.cnt ) - 1,
|
|
1660 ii.leftOverhang.classifier, ii.leftOverhang.classifier,
|
|
1661 ii.start - 1 ) ;
|
|
1662 }
|
|
1663
|
|
1664 // right overhang.
|
|
1665 if ( ii.rightOverhang.cnt > 0 )
|
|
1666 {
|
|
1667 printf( "%s %d %d 0 1 . . -1 -1 -1 %lf %lf 0 1 %d\n",
|
|
1668 alignments.GetChromName( ii.chrId ),
|
|
1669 ii.end - ( ii.rightOverhang.length / ii.rightOverhang.cnt ) + 1, ii.end,
|
|
1670 ii.rightOverhang.classifier, ii.rightOverhang.classifier,
|
|
1671 ii.end + 1 ) ;
|
|
1672 }
|
|
1673
|
|
1674 }
|
|
1675 }
|
|
1676 }
|
|
1677
|
|
1678 return 0 ;
|
|
1679 }
|
|
1680
|