comparison PsiCLASS-1.0.2/TranscriptDecider.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 #include "TranscriptDecider.hpp"
2
// Emit one assembled transcript (a set of chosen subexons) for a sample.
// Adjacent chosen subexons are first concatenated into exons; the result is
// then handed to the shared output handler (the direct fprintf branch is
// intentionally disabled, see "if ( 0 )" below).
void TranscriptDecider::OutputTranscript( int sampleId, struct _subexon *subexons, struct _transcript &transcript )
{
	int i, j ;
	// Collect the indices of the subexons this transcript uses.
	std::vector<int> subexonInd ;
	transcript.seVector.GetOnesIndices( subexonInd ) ;

	// Determine the strand from the first true intron (a coordinate gap
	// between two consecutive chosen subexons); stays "." when no intron
	// decides it (e.g. single-exon transcripts).
	char strand[2] = "." ;
	int size = subexonInd.size() ;
	if ( size > 1 )
	{
		// locate the intron showed up in this transcript.
		for ( i = 0 ; i < size - 1 ; ++i )
		{
			/*int nextCnt = subexons[ subexonInd[i] ].nextCnt ;
			if ( nextCnt == 0 )
				continue ;

			for ( j = 0 ; j < nextCnt ; ++j )
			{
				int a = subexons[ subexonInd[i] ].next[j] ;
				if ( subexonInd[i + 1] == a
					&& subexons[ subexonInd[i] ].end + 1 < subexons[a].start ) // avoid the case like ..(...[...
				{
					break ;
				}
			}
			if ( j < nextCnt )*/

			// A gap between consecutive chosen subexons is an intron; its
			// donor-side strand (rightStrand: +1/-1) decides the transcript strand.
			if ( subexons[ subexonInd[i] ].end + 1 < subexons[ subexonInd[i + 1] ].start )
			{
				if ( subexons[ subexonInd[i] ].rightStrand == 1 )
					strand[0] = '+' ;
				else if ( subexons[ subexonInd[i] ].rightStrand == -1 )
					strand[0] = '-' ;
				break ;
			}
		}
	}

	// TODO: transcript_id
	char *chrom = alignments.GetChromName( subexons[0].chrId ) ;
	char prefix[10] = "" ;
	// Concatenate adjacent subexons (start == previous end + 1) into exons.
	// At most "size" entries are used; one extra slot is allocated.
	struct _subexon *catSubexons = new struct _subexon[ size + 1 ] ;
	catSubexons[0] = subexons[ subexonInd[0] ] ;
	j = 1 ;
	for ( i = 1 ; i < size ; ++i )
	{
		if ( subexons[ subexonInd[i] ].start == catSubexons[j - 1].end + 1 )
		{
			// Directly adjacent: extend the current exon.
			catSubexons[j - 1].end = subexons[ subexonInd[i] ].end ;
		}
		else
		{
			// Gap: start a new exon.
			catSubexons[j] = subexons[ subexonInd[i] ] ;
			++j ;
		}
	}
	size = j ; // from here on, "size" counts exons, not subexons.

	int gid = GetTranscriptGeneId( subexonInd, subexons ) ;
	// Disabled direct-print path kept for reference; output always goes
	// through outputHandler below.
	if ( 0 ) //numThreads <= 1 )
	{
		fprintf( outputFPs[sampleId], "%s\tCLASSES\ttranscript\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; Abundance \"%.6lf\";\n",
			chrom, catSubexons[0].start + 1, catSubexons[size - 1].end + 1, strand,
			prefix, chrom, gid,
			prefix, chrom, gid, transcriptId[ gid - baseGeneId ], transcript.FPKM ) ;
		for ( i = 0 ; i < size ; ++i )
		{
			fprintf( outputFPs[ sampleId ], "%s\tCLASSES\texon\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; "
				"transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; Abundance \"%.6lf\"\n",
				chrom, catSubexons[i].start + 1, catSubexons[i].end + 1, strand,
				prefix, chrom, gid,
				prefix, chrom, gid, transcriptId[ gid - baseGeneId ],
				i + 1, transcript.FPKM ) ;
		}
	}
	else
	{
		// Package the transcript for the (possibly multi-threaded) output handler.
		struct _outputTranscript t ;
		int len = 0 ;
		t.chrId = subexons[0].chrId ;
		t.geneId = gid ;
		t.transcriptId = transcriptId[ gid - baseGeneId ] ;
		t.FPKM = transcript.FPKM ;
		t.sampleId = sampleId ;
		t.exons = new struct _pair32[size] ;
		// The +1 presumably converts 0-based internal coordinates to 1-based
		// GTF coordinates — TODO confirm against the output handler.
		for ( i = 0 ; i < size ; ++i )
		{
			t.exons[i].a = catSubexons[i].start + 1 ;
			t.exons[i].b = catSubexons[i].end + 1 ;
			len += t.exons[i].b - t.exons[i].a + 1 ;
		}
		// Average per-base read coverage over the transcript length.
		t.cov = transcript.abundance * alignments.readLen / len ;
		t.ecnt = size ;
		t.strand = strand[0] ;
		//printf( "%lf\n", transcript.correlationScore ) ;

		if ( numThreads > 1 )
			outputHandler->Add( t ) ;
		else
			outputHandler->Add_SingleThread( t ) ;
	}
	// Per-gene running counter used to build the next transcript id.
	++transcriptId[ gid - baseGeneId ] ;

	delete[] catSubexons ;
}
112
113 int TranscriptDecider::GetFather( int f, int *father )
114 {
115 if ( father[f] != f )
116 return father[f] = GetFather( father[f], father ) ;
117 return f ;
118 }
119
120 int TranscriptDecider::GetTranscriptGeneId( std::vector<int> &subexonInd, struct _subexon *subexons )
121 {
122 int i ;
123 int size = subexonInd.size() ;
124
125 for ( i = 0 ; i < size ; ++i )
126 if ( subexons[ subexonInd[i] ].geneId != -2 )
127 return subexons[ subexonInd[i] ].geneId ;
128
129 // Some extreme case, where all the regions are mixture regions.
130 for ( i = 0 ; i < size - 1 ; ++i )
131 if ( subexons[ subexonInd[i] ].end + 1 < subexons[ subexonInd[i + 1] ].start )
132 {
133 return defaultGeneId[ ( subexons[ subexonInd[i] ].rightStrand + 1 ) / 2 ] ;
134 }
135 return defaultGeneId[0] ;
136 }
137
138 int TranscriptDecider::GetTranscriptGeneId( struct _transcript &t, struct _subexon *subexons )
139 {
140 if ( subexons[ t.first ].geneId != -2 )
141 return subexons[ t.first ].geneId ;
142 if ( subexons[ t.last ].geneId != -2 )
143 return subexons[ t.last ].geneId ;
144 std::vector<int> subexonInd ;
145 t.seVector.GetOnesIndices( subexonInd ) ;
146 return GetTranscriptGeneId( subexonInd, subexons ) ;
147 }
148
149 void TranscriptDecider::InitTranscriptId()
150 {
151 int i ;
152 for ( i = 0 ; i < usedGeneId - baseGeneId ; ++i )
153 transcriptId[i] = 0 ;
154 }
155
156 bool TranscriptDecider::IsStartOfMixtureStrandRegion( int tag, struct _subexon *subexons, int seCnt )
157 {
158 int j, k ;
159 int leftStrandCnt[2] = {0, 0}, rightStrandCnt[2] = {0, 0};
160 for ( j = tag + 1 ; j < seCnt ; ++j )
161 if ( subexons[j].start > subexons[j - 1].end + 1 )
162 break ;
163
164 for ( k = tag ; k < j ; ++k )
165 {
166 if ( subexons[k].leftStrand != 0 )
167 ++leftStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] ;
168 if ( subexons[k].rightStrand != 0 )
169 ++rightStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] ;
170 }
171
172 if ( rightStrandCnt[0] > 0 && leftStrandCnt[0] == 0 && leftStrandCnt[1] > 0 )
173 return true ;
174 if ( rightStrandCnt[1] > 0 && leftStrandCnt[1] == 0 && leftStrandCnt[0] > 0 )
175 return true ;
176 return false ;
177 }
178
// Return 0 - incompatible or does not overlap at all. 1 - fully compatible. 2 - Head of the constraint compatible with the tail of the transcript.
// The partial compatible case (return 2) most likely happens in DP where we have a partial transcript.
// NOTE(review): uses the shared member scratch buffers compatibleTestVectorT/C,
// so this looks non-reentrant — confirm before calling from multiple threads.
int TranscriptDecider::IsConstraintInTranscript( struct _transcript transcript, struct _constraint &c )
{
	//printf( "%d %d, %d %d\n", c.first, c.last, transcript.first, transcript.last ) ;
	// Quick rejection: the constraint must start inside the transcript's
	// subexon range, its first subexon must be chosen by the transcript, and
	// for a complete transcript its last subexon must be chosen too.
	if ( c.first < transcript.first || c.first > transcript.last
		|| !transcript.seVector.Test( c.first )
		|| ( !transcript.partial && !transcript.seVector.Test( c.last ) ) ) // no overlap or starts too early or some chosen subexons does not compatible
		return 0 ;

	// Extract the subexons we should focus on: the window [s, e].
	int s, e ;
	s = c.first ;
	e = c.last ;
	bool returnPartial = false ;
	if ( e > transcript.last ) // constraint ends after the transcript.
	{
		// Only a partial (still growing) transcript can report the
		// head-compatible case; a complete transcript is simply incompatible.
		if ( transcript.partial )
		{
			e = transcript.last ;
			returnPartial = true ;
		}
		else
			return 0 ;
	}
	/*printf( "%s: %d %d: (%d %d) (%d %d)\n", __func__, s, e,
		transcript.seVector.Test(0), transcript.seVector.Test(1),
		c.vector.Test(0), c.vector.Test(1) ) ;*/

	// Restrict both bit vectors to the window and compare: equality there
	// means the constraint's subexon pattern matches the transcript exactly.
	compatibleTestVectorT.Assign( transcript.seVector ) ;
	//compatibleTestVectorT.MaskRegionOutsideInRange( s, e, transcript.first, transcript.last ) ;
	compatibleTestVectorT.MaskRegionOutside( s, e ) ;

	compatibleTestVectorC.Assign( c.vector ) ;
	if ( c.last > transcript.last )
	{
		// Only the tail needs masking when the constraint overhangs.
		//compatibleTestVectorC.MaskRegionOutsideInRange( s, e, c.first, c.last ) ;
		//compatibleTestVectorC.MaskRegionOutside( s, e ) ;
		compatibleTestVectorC.MaskRegionOutside( 0, e ) ; // Because the bits before s are already all 0s in C.
	}
	/*printf( "after masking %d %d. %d %d %d %d:\n", s, e, transcript.first, transcript.last, c.first, c.last ) ;
	compatibleTestVectorT.Print() ;
	compatibleTestVectorC.Print() ; */
	// Test compatible.
	int ret = 0 ;
	if ( compatibleTestVectorT.IsEqual( compatibleTestVectorC ) )
	{
		if ( returnPartial )
			ret = 2 ;
		else
			ret = 1 ;
	}

	return ret ;
}
234
235 int TranscriptDecider::IsConstraintInTranscriptDebug( struct _transcript transcript, struct _constraint &c )
236 {
237 //printf( "%d %d, %d %d\n", c.first, c.last, transcript.first, transcript.last ) ;
238 if ( c.first < transcript.first || c.first > transcript.last ) // no overlap or starts too early.
239 return 0 ;
240 printf( "hi\n" ) ;
241 // Extract the subexons we should focus on.
242 int s, e ;
243 s = c.first ;
244 e = c.last ;
245 bool returnPartial = false ;
246 if ( e > transcript.last ) // constraints ends after the transcript.
247 {
248 if ( transcript.partial )
249 {
250 e = transcript.last ;
251 returnPartial = true ;
252 }
253 else
254 return 0 ;
255 }
256 /*printf( "%s: %d %d: (%d %d) (%d %d)\n", __func__, s, e,
257 transcript.seVector.Test(0), transcript.seVector.Test(1),
258 c.vector.Test(0), c.vector.Test(1) ) ;*/
259
260 compatibleTestVectorT.Assign( transcript.seVector ) ;
261 compatibleTestVectorT.MaskRegionOutside( s, e ) ;
262
263 compatibleTestVectorC.Assign( c.vector ) ;
264 if ( e > transcript.last )
265 compatibleTestVectorC.MaskRegionOutside( s, e ) ;
266 /*printf( "after masking: (%d %d) (%d %d)\n",
267 compatibleTestVectorT.Test(0), compatibleTestVectorT.Test(1),
268 compatibleTestVectorC.Test(0), compatibleTestVectorC.Test(1) ) ;*/
269
270 // Test compatible.
271 int ret = 0 ;
272 if ( compatibleTestVectorT.IsEqual( compatibleTestVectorC ) )
273 {
274 if ( returnPartial )
275 ret = 2 ;
276 else
277 ret = 1 ;
278 }
279 compatibleTestVectorT.Print() ;
280 compatibleTestVectorC.Print() ;
281 printf( "ret=%d\n", ret ) ;
282 return ret ;
283 }
284 int TranscriptDecider::SubTranscriptCount( int tag, struct _subexon *subexons, int *f )
285 {
286 if ( f[tag] != -1 )
287 return f[tag] ;
288
289 int ret = 0 ;
290 int i ;
291 if ( subexons[tag].canBeEnd )
292 ret = 1 ;
293 for ( i = 0 ; i < subexons[tag].nextCnt ; ++i )
294 {
295 ret += SubTranscriptCount( subexons[tag].next[i], subexons, f ) ;
296 }
297
298 if ( ret == 0 )
299 ret = 1 ;
300 return f[tag] = ret ;
301 }
302
303 void TranscriptDecider::CoalesceSameTranscripts( std::vector<struct _transcript> &t )
304 {
305 int i, k ;
306 if ( t.size() == 0 )
307 return ;
308
309 std::sort( t.begin(), t.end(), CompSortTranscripts ) ;
310
311 int size = t.size() ;
312 k = 0 ;
313 for ( i = 1 ; i < size ; ++i )
314 {
315 if ( t[k].seVector.IsEqual( t[i].seVector ) )
316 {
317 t[k].abundance += t[i].abundance ;
318 t[i].seVector.Release() ;
319 }
320 else
321 {
322 ++k ;
323 if ( i != k )
324 t[k] = t[i] ;
325 }
326 }
327 t.resize( k + 1 ) ;
328 }
329
// Depth-first enumeration of every transcript reachable from subexon "tag".
// visit[0..vcnt] is the path of subexon indices walked so far, "strand" is
// the strand committed by the introns taken on this path (0 = undecided),
// and correlationScore is the minimum pairwise subexon correlation along the
// path. Complete transcripts are recorded at alltranscripts[atcnt].
// NOTE(review): alltranscripts[atcnt] is written without growing the vector,
// so the caller presumably preallocates it (including the seVectors) — confirm.
void TranscriptDecider::EnumerateTranscript( int tag, int strand, int visit[], int vcnt, struct _subexon *subexons, SubexonCorrelation &correlation, double correlationScore, std::vector<struct _transcript> &alltranscripts, int &atcnt )
{
	int i ;
	visit[ vcnt ] = tag ;
	//printf( "%s: %d %d %d %d. %d %d\n", __func__, vcnt, tag, subexons[tag].nextCnt, strand, subexons[tag].start, subexons[tag].end ) ;
	// Update the running minimum correlation between the new subexon and
	// every subexon already on the path.
	double minCor = correlationScore ;
	for ( i = 0 ; i < vcnt ; ++i )
	{
		double tmp = correlation.Query( visit[i], visit[vcnt] ) ;
		if ( tmp < minCor )
			minCor = tmp ;
	}

	// If this subexon can terminate a transcript, record the current path as
	// a complete transcript.
	if ( subexons[tag].canBeEnd )
	{
		struct _transcript &txpt = alltranscripts[atcnt] ;
		for ( i = 0 ; i <= vcnt ; ++i )
			txpt.seVector.Set( visit[i] ) ;

		txpt.first = visit[0] ;
		txpt.last = visit[vcnt] ;
		txpt.partial = false ;
		txpt.correlationScore = minCor ;

		//printf( "%lf %d %d ", txpt.correlationScore, vcnt, visit[0] ) ;
		//txpt.seVector.Print() ;
		++atcnt ;
	}

	// Recurse into each successor subexon.
	for ( i = 0 ; i < subexons[tag].nextCnt ; ++i )
	{
		int a = subexons[tag].next[i] ;
		// Crossing a real intron (coordinate gap) must agree with the strand
		// already committed on this path.
		if ( !SubexonGraph::IsSameStrand( subexons[tag].rightStrand, strand )
			&& subexons[a].start > subexons[tag].end + 1 )
			continue ;
		int backupStrand = strand ;
		// Taking the first intron on a still-undecided path fixes the strand.
		if ( subexons[a].start > subexons[tag].end + 1 && strand == 0 )
			strand = subexons[tag].rightStrand ;
		EnumerateTranscript( subexons[tag].next[i], strand, visit, vcnt + 1, subexons, correlation, minCor, alltranscripts, atcnt ) ;
		strand = backupStrand ;
	}
}
373
// DP helper: extend the partial transcript "parents[0..pcnt-1]" through the
// chain visit[0..vcnt-1] + tag, and fold the best result into pdp (the
// parents' DP cell). "extends" lists the constraints that currently straddle
// the parents/visit boundary; once no constraint extends past "tag" the
// remainder is solved independently via SolveSubTranscript (memoized).
// cover semantics: -1 = unset, -2 = violates minAbundance, otherwise the
// count of satisfied constraints (or the min normAbund when attr.forAbundance).
// NOTE(review): attr.bufferTxpt is shared scratch; it is rebuilt before each
// use and must not be assumed to survive the recursive calls.
void TranscriptDecider::SearchSubTranscript( int tag, int strand, int parents[], int pcnt, struct _dp &pdp, int visit[], int vcnt, int extends[], int extendCnt,
	std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr )
{
	int i ;
	int size ;
	double cover ;
	bool keepSearch = true ;
	bool belowMin = false ;

	struct _subexon *subexons = attr.subexons ;

	// Append tag to the visit chain.
	visit[vcnt] = tag ;
	++vcnt ;
	struct _dp visitdp ;

	visitdp.cover = -1 ;

	// Build the current partial transcript (parents + visit) in the shared buffer.
	struct _transcript &subTxpt = attr.bufferTxpt ;
	subTxpt.seVector.Reset() ;
	for ( i = 0 ; i < pcnt ; ++i )
		subTxpt.seVector.Set( parents[i] ) ;
	subTxpt.first = parents[0] ;
	subTxpt.last = parents[ pcnt - 1] ;
	for ( i = 0 ; i < vcnt ; ++i )
		subTxpt.seVector.Set( visit[i] ) ;
	subTxpt.last = visit[ vcnt - 1 ] ;
	subTxpt.partial = true ;

	// Adjust the extendCnt: drop the extending constraints that can no longer
	// be satisfied once tag is part of the chain.
	/*printf( "%s: %d %d %d\n", __func__, vcnt , extendCnt, extends[ extendCnt - 1] ) ;
	subTxpt.seVector.Print() ;
	tc[extends[extendCnt - 1]].vector.Print() ;
	printf( "Adjust extend:\n") ;*/
	for ( i = extendCnt - 1 ; i >= 0 ; --i )
	{
		if ( tc[ extends[i] ].last <= tag || ( tc[ extends[i] ].vector.Test( tag ) && IsConstraintInTranscript( subTxpt, tc[ extends[i] ] ) != 0 ) )
			break ;
	}
	extendCnt = i + 1 ;

	// If the extension ends: no remaining constraint reaches past tag, so the
	// rest of the transcript can be solved (and memoized) independently.
	subTxpt.partial = false ;
	if ( subexons[tag].nextCnt > 0 && ( extendCnt == 0 || tag >= tc[ extends[ extendCnt - 1 ] ].last ) )
	{
		// Solve the subtranscript beginning with visit.
		// Now we got the optimal transcript for visit.
		visitdp = SolveSubTranscript( visit, vcnt, strand, tc, tcStartInd, attr ) ;
		keepSearch = false ;
	}
	//printf( "%s %d %d: visitdp.cover=%lf\n", __func__, parents[0], tag, visitdp.cover ) ;

	// Fold in the constraints that span across the parents and visit.
	size = tc.size() ;
	if ( visitdp.cover >= 0 )
	{
		cover = visitdp.cover ;
		// Reset the subTxpt, since its content is modified in SolveSubTranscript called above.
		subTxpt.seVector.Reset() ;
		for ( i = 0 ; i < pcnt ; ++i )
			subTxpt.seVector.Set( parents[i] ) ;
		subTxpt.seVector.Or( visitdp.seVector ) ;
		subTxpt.first = parents[0] ;
		subTxpt.last = visitdp.last ;
		subTxpt.partial = false ;

		// Penalize transitions that no constraint covers (uncoveredPair).
		if ( !attr.forAbundance && attr.minAbundance > 0 )
		{
			for ( i = 0 ; i < pcnt - 1 ; ++i )
			{
				if ( attr.uncoveredPair.find( parents[i] * attr.seCnt + parents[i + 1] ) != attr.uncoveredPair.end() )
					belowMin = true ;
			}
			// i == -1 checks the junction between parents and visit.
			for ( i = -1 ; i < vcnt - 1 ; ++i )
			{
				if ( i == -1 && pcnt >= 1 )
				{
					if ( attr.uncoveredPair.find( parents[pcnt - 1] * attr.seCnt + visit[0] ) != attr.uncoveredPair.end() )
						belowMin = true ;
				}
				else
				{
					if ( attr.uncoveredPair.find( visit[i] * attr.seCnt + visit[i + 1] ) != attr.uncoveredPair.end() )
						belowMin = true ;
				}
			}
			if ( attr.forAbundance && belowMin )
				cover = 1e-6 ;
		}

		// Accumulate the cross-boundary constraints.
		for ( i = tcStartInd ; i < size ; ++i )
		{
			if ( tc[i].first > parents[ pcnt - 1] )
				break ;

			if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 )
			{
				if ( tc[i].normAbund <= attr.minAbundance )
				{
					belowMin = true ;
					cover = -2 ;
					break ;
				}

				if ( tc[i].abundance <= 0 )
					continue ;

				if ( attr.forAbundance )
				{
					// Track the bottleneck (minimum) abundance.
					if ( tc[i].normAbund < cover || cover == 0 )
						cover = tc[i].normAbund ;
				}
				else
				{
					++cover ;
				}
			}
		}
		// Update the parents' DP cell: record a -2 (below-minimum) result only
		// if nothing better was found yet, otherwise keep the max cover.
		if ( belowMin && pdp.cover == -1 )
		{
			pdp.cover = -2 ;
			pdp.seVector.Assign( subTxpt.seVector ) ;
			pdp.first = subTxpt.first ;
			pdp.last = subTxpt.last ;
			pdp.strand = strand ;
		}
		else if ( cover > pdp.cover )
		{
			pdp.cover = cover ;
			pdp.seVector.Assign( subTxpt.seVector ) ;
			pdp.first = subTxpt.first ;
			pdp.last = subTxpt.last ;
			pdp.strand = strand ;
		}
	}
	else if ( visitdp.cover == -2 && pdp.cover == -1 ) // no valid extension from visit
	{
		// Propagate the below-minimum marker so the caller knows this path exists.
		subTxpt.seVector.Reset() ;
		for ( i = 0 ; i < pcnt ; ++i )
			subTxpt.seVector.Set( parents[i] ) ;
		subTxpt.seVector.Or( visitdp.seVector ) ;
		subTxpt.first = parents[0] ;
		subTxpt.last = visitdp.last ;

		pdp.cover = -2 ;
		pdp.seVector.Assign( subTxpt.seVector ) ;
		pdp.first = subTxpt.first ;
		pdp.last = subTxpt.last ;
		pdp.strand = strand ;
	}

	if ( subexons[tag].canBeEnd && ( visitdp.cover < 0 || attr.forAbundance ) )
	// This works is because that the extension always covers more constraints. So we only go this branch if the extension does not work
	// and it goes this branch if it violates minAbundance
	// But we need to go here when we want to compute the maxAbundance transcript.
	// This part also works as the exit point of the recursive function.
	{
		// Consider terminating the transcript at tag.
		bool belowMin = false ;
		subTxpt.seVector.Reset() ;
		for ( i = 0 ; i < pcnt ; ++i )
			subTxpt.seVector.Set( parents[i] ) ;
		for ( i = 0 ; i < vcnt ; ++i )
			subTxpt.seVector.Set( visit[i] ) ;
		subTxpt.first = parents[0] ;
		subTxpt.last = visit[ vcnt - 1] ;
		subTxpt.partial = false ;

		cover = 0 ;
		if ( attr.forAbundance || attr.minAbundance > 0 )
		{
			// Same uncovered-transition penalty as above.
			for ( i = 0 ; i < pcnt - 1 ; ++i )
			{
				if ( attr.uncoveredPair.find( parents[i] * attr.seCnt + parents[i + 1] ) != attr.uncoveredPair.end() )
					belowMin = true ;
			}
			for ( i = -1 ; i < vcnt - 1 ; ++i )
			{
				if ( i == -1 && pcnt >= 1 )
				{
					if ( attr.uncoveredPair.find( parents[pcnt - 1] * attr.seCnt + visit[0] ) != attr.uncoveredPair.end() )
						belowMin = true ;
				}
				else
				{
					if ( attr.uncoveredPair.find( visit[i] * attr.seCnt + visit[i + 1] ) != attr.uncoveredPair.end() )
						belowMin = true ;
				}
			}

			//if ( belowMin == true )
			//	printf( "turned belowMin. %d. %d %d: %d %d %d\n", attr.uncoveredPair.size(), pcnt, vcnt, parents[0], visit[0], visit[ vcnt - 1] ) ;

			if ( attr.forAbundance && belowMin )
				cover = 1e-6 ;
		}

		for ( i = tcStartInd ; i < size ; ++i )
		{
			// note that the value is parents[ pcnt - 1], because
			// in above the part of "visit" is computed in SolveSubTranscript( visit ).
			if ( tc[i].first > visit[ vcnt - 1] )
				break ;
			if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 )
			{
				if ( tc[i].normAbund <= attr.minAbundance )
				{
					belowMin = true ;
					cover = -2 ;
					break ;
				}

				if ( tc[i].abundance <= 0 )
					continue ;
				if ( attr.forAbundance )
				{
					if ( tc[i].normAbund < cover || cover == 0 )
						cover = tc[i].normAbund ;
				}
				else
				{
					++cover ;
				}
			}
		}

		if ( belowMin && pdp.cover == -1 )
		{
			pdp.cover = -2 ;
			pdp.seVector.Assign( subTxpt.seVector ) ;
			pdp.first = subTxpt.first ;
			pdp.last = subTxpt.last ;
			pdp.strand = strand ;
		}
		else if ( cover > pdp.cover )
		{
			pdp.cover = cover ;
			pdp.seVector.Assign( subTxpt.seVector ) ;
			pdp.first = subTxpt.first ;
			pdp.last = subTxpt.last ;
			pdp.strand = strand ;
		}
	}
	//printf( "%s %d: pdp.cover=%lf\n", __func__, tag, pdp.cover ) ;

	// keep searching: extend the visit chain into each strand-compatible
	// (or directly adjacent) successor of tag.
	if ( keepSearch )
	{
		for ( i = 0 ; i < subexons[tag].nextCnt ; ++i )
		{
			int b = subexons[tag].next[i] ;
			if ( ( SubexonGraph::IsSameStrand( subexons[tag].rightStrand, strand )
				&& SubexonGraph::IsSameStrand( subexons[b].leftStrand, strand ) ) ||
				subexons[b].start == subexons[tag].end + 1 )
			{
				int backupStrand = strand ;
				// Taking an intron commits the strand for the recursion below.
				if ( subexons[b].start > subexons[tag].end + 1 )
					strand = subexons[tag].rightStrand ;

				SearchSubTranscript( subexons[tag].next[i], strand, parents, pcnt, pdp, visit, vcnt,
					extends, extendCnt, tc, tcStartInd, attr ) ;
				strand = backupStrand ;
			}
		}

	}

	return ;
}
641
// DP core: compute the optimal continuation of the partial transcript whose
// head is the subexon chain visit[0..vcnt-1]. Results are memoized by chain
// length: f1[] for single-subexon heads, f2[][] for two-subexon heads, and a
// hash table for longer heads. A cached entry is reusable when its timestamp
// matches the current pass, or when it failed (-2) under an even smaller
// minAbundance threshold. The returned struct always references memory owned
// by attr's DP tables (see comment before the store at the bottom).
struct _dp TranscriptDecider::SolveSubTranscript( int visit[], int vcnt, int strand, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr )
{
	int i ;
	int size ;
	/*printf( "%s: ", __func__ ) ;
	for ( i = 0 ; i < vcnt ; ++i )
		printf( "%d ", visit[i] ) ;
	printf( ": %lf %d %d", attr.f1[ visit[0] ].cover, attr.f1[ visit[0] ].timeStamp, attr.timeStamp ) ;
	printf( "\n" ) ;*/
	// Test whether the result is already stored in the DP tables.
	if ( vcnt == 1 )
	{
		if ( attr.f1[ visit[0] ].cover != -1 && attr.f1[ visit[0] ].strand == strand && ( attr.f1[ visit[0] ].timeStamp == attr.timeStamp ||
			( attr.f1[ visit[0] ].minAbundance < attr.minAbundance && attr.f1[visit[0]].cover == -2 ) ) ) //even given lower minAbundance threshold, it fails
		{
			return attr.f1[ visit[0] ] ;
		}
	}
	else if ( vcnt == 2 && attr.f2 )
	{
		int a = visit[0] ;
		int b = visit[1] ;

		if ( attr.f2[a][b].cover != -1 && attr.f2[a][b].strand == strand && ( attr.f2[a][b].timeStamp == attr.timeStamp ||
			( attr.f2[a][b].minAbundance < attr.minAbundance && attr.f2[a][b].cover == -2 ) ) )
		{
			return attr.f2[a][b] ;
		}
	}
	else
	{
		// Longer heads: hash the chain. The modulo can go negative on
		// overflow, hence the correction below.
		int key = 0 ;
		for ( i = 0 ; i < vcnt ; ++i )
			key = ( key * attr.seCnt + visit[i] ) % hashMax ;
		if ( key < 0 )
			key += hashMax ;

		if ( attr.hash[key].cover != -1 && attr.hash[key].cnt == vcnt && attr.hash[key].strand == strand &&
			( attr.hash[key].first == visit[0] ) &&
			( attr.hash[key].timeStamp == attr.timeStamp ||
				( attr.hash[key].minAbundance < attr.minAbundance && attr.hash[key].cover == -2 ) ) )
		{
			// Verify the hash hit really corresponds to this chain by
			// comparing the head region of the bit vectors.
			// NOTE(review): shallow copy — presumably shares the underlying
			// bit storage with bufferTxpt; verify BitTable copy semantics.
			struct _transcript subTxpt = attr.bufferTxpt ;
			subTxpt.seVector.Reset() ;
			for ( i = 0 ; i < vcnt ; ++i )
				subTxpt.seVector.Set( visit[i] ) ;
			//subTxpt.seVector.Print() ;
			//attr.hash[key].seVector.Print() ;
			subTxpt.seVector.Xor( attr.hash[key].seVector ) ;
			subTxpt.seVector.MaskRegionOutside( visit[0], visit[ vcnt - 1] ) ;
			//printf( "hash test: %d %d\n", key, subTxpt.seVector.IsAllZero() ) ;
			if ( subTxpt.seVector.IsAllZero() )
			{
				return attr.hash[key] ;
			}

			// Can't use the code below, because vcnt is the header of subexons.
			/*for ( i = 0 ; i < vcnt ; ++i )
				if ( !attr.hash[key].seVector.Test( visit[i] ) )
					break ;
			if ( i >= vcnt )
				return attr.hash[key] ;*/

		}
	}
	// Adjust tcStartInd: skip the constraints that end before this chain begins.
	size = tc.size() ;
	for ( i = tcStartInd ; i < size ; ++i )
		if ( tc[i].first >= visit[0] )
			break ;
	tcStartInd = i ;


	struct _subexon *subexons = attr.subexons ;
	struct _dp visitdp ;
	visitdp.seVector.Init( attr.seCnt ) ;
	visitdp.cover = -1 ;

	struct _transcript &subTxpt = attr.bufferTxpt ;
	// This happens when it is called from PickTranscriptsByDP, the first subexon might be the end.
	subTxpt.seVector.Reset() ;
	for ( i = 0 ; i < vcnt ; ++i )
		subTxpt.seVector.Set( visit[i] ) ;
	subTxpt.first = visit[0] ;
	subTxpt.last = visit[vcnt - 1] ;

	// Base case: the chain may terminate here; score the chain as a complete transcript.
	if ( subexons[ visit[vcnt - 1] ].canBeEnd )
	{
		subTxpt.partial = false ;
		double cover = 0 ;
		for ( i = tcStartInd ; i < size ; ++i )
		{
			if ( tc[i].first > subTxpt.last )
				break ;

			if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 )
			{
				// -2 marks a violation of the minAbundance threshold.
				if ( tc[i].normAbund <= attr.minAbundance )
				{
					cover = -2 ;
					break ;
				}

				if ( tc[i].abundance <= 0 )
					continue ;
				if ( attr.forAbundance )
				{
					// Track the bottleneck (minimum) abundance.
					if ( tc[i].normAbund < cover || cover == 0 )
						cover = tc[i].normAbund ;
				}
				else
					++cover ;
			}
		}

		visitdp.seVector.Assign( subTxpt.seVector ) ;
		visitdp.cover = cover ;
		visitdp.first = subTxpt.first ;
		visitdp.last = subTxpt.last ;
		visitdp.strand = strand ;
	}

	// Now we extend: collect the constraints whose head matches the chain's
	// tail (IsConstraintInTranscript == 2) — these force the extension.
	size = tc.size() ;
	int *extends = new int[tc.size() - tcStartInd + 1] ;
	int extendCnt = 0 ;
	subTxpt.partial = true ;
	for ( i = tcStartInd ; i < size ; ++i )
	{
		if ( tc[i].first > subTxpt.last )
			break ;
		if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 2 )
		{
			extends[extendCnt] = i ;
			++extendCnt ;
		}
	}

	// Sort the extend by the index of the last subexon.
	if ( extendCnt > 0 )
	{
		struct _pair32 *extendsPairs = new struct _pair32[extendCnt] ;

		for ( i = 0 ; i < extendCnt ; ++i )
		{
			extendsPairs[i].a = extends[i] ;
			extendsPairs[i].b = tc[ extends[i] ].last ;
		}
		qsort( extendsPairs, extendCnt, sizeof( struct _pair32 ), CompPairsByB ) ;

		for ( i = 0 ; i < extendCnt ; ++i )
			extends[i] = extendsPairs[i].a ;

		delete[] extendsPairs ;
	}

	// Recurse into each strand-compatible (or directly adjacent) successor.
	// nextv is scratch for the recursion's visit chain, sized by how far the
	// farthest extending constraint reaches.
	size = subexons[ visit[vcnt - 1] ].nextCnt ;
	int nextvCnt = 1 ;
	if ( extendCnt > 0 && tc[ extends[ extendCnt - 1 ] ].last - visit[ vcnt - 1 ] > 1 )
		nextvCnt = tc[ extends[ extendCnt - 1 ] ].last - visit[ vcnt - 1 ] ;
	int *nextv = new int[ nextvCnt ] ;
	for ( i = 0 ; i < size ; ++i )
	{
		int a = visit[vcnt - 1] ;
		int b = subexons[a].next[i] ;
		if ( ( SubexonGraph::IsSameStrand( subexons[a].rightStrand, strand )
			&& SubexonGraph::IsSameStrand( subexons[b].leftStrand, strand ) )
			||
			subexons[b].start == subexons[a].end + 1 )
		{
			int backupStrand = strand ;
			// Taking an intron commits the strand for the recursion.
			if ( subexons[b].start > subexons[a].end + 1 )
				strand = subexons[a].rightStrand ;
			SearchSubTranscript( subexons[ visit[vcnt - 1] ].next[i], strand, visit, vcnt, visitdp, nextv, 0, extends, extendCnt, tc, tcStartInd, attr ) ;
			strand = backupStrand ;

		}
	}
	//printf( "%s %d(%d) %d %d %d: %lf\n", __func__, visit[0], subexons[ visit[vcnt - 1] ].canBeEnd, size, extendCnt, strand, visitdp.cover ) ;
	delete[] nextv ;
	delete[] extends ;

	// store the result in the dp structure.
	// We return the structure stored in dp to simplify the memory access pattern.
	// In other words, we assume the structure returned from this function always uses the memory from attr.dp
	if ( vcnt == 1 )
	{
		SetDpContent( attr.f1[ visit[0] ], visitdp, attr ) ;
		visitdp.seVector.Release() ;
		return attr.f1[ visit[0] ] ;
	}
	else if ( vcnt == 2 && attr.f2 )
	{
		SetDpContent( attr.f2[ visit[0] ][ visit[1] ], visitdp, attr ) ;
		visitdp.seVector.Release() ;
		return attr.f2[ visit[0] ][ visit[1] ] ;
	}
	else
	{
		// Recompute the same hash key as in the lookup above.
		int key = 0 ;
		for ( i = 0 ; i < vcnt ; ++i )
			key = ( key * attr.seCnt + visit[i] ) % hashMax ;
		if ( key < 0 )
			key += hashMax ;

		//static int hashUsed = 0 ;
		//if ( attr.hash[key].cover == -1 )
		//	++hashUsed ;
		//printf( "%d/%d\n", hashUsed, HASH_MAX) ;
		//printf( "hash write: %d\n", key ) ;
		SetDpContent( attr.hash[key], visitdp, attr ) ;
		attr.hash[key].cnt = vcnt ;
		visitdp.seVector.Release() ;
		return attr.hash[key] ;
	}
}
858
859
860 void TranscriptDecider::PickTranscriptsByDP( struct _subexon *subexons, int seCnt, int iterBound, Constraints &constraints, SubexonCorrelation &correlation, struct _dpAttribute &attr, std::vector<struct _transcript> &alltranscripts )
861 {
862 int i, j, k ;
863
864 std::vector<struct _transcript> transcripts ;
865 std::vector<struct _constraint> &tc = constraints.constraints ;
866 int tcCnt = tc.size() ;
867 int coalesceThreshold = 1024 ;
868
869 //printf( "tcCnt=%d\n", tcCnt ) ;
870
871 attr.timeStamp = 1 ;
872 attr.bufferTxpt.seVector.Init( seCnt ) ;
873 attr.subexons = subexons ;
874 attr.seCnt = seCnt ;
875
876 double maxAbundance = -1 ;
877 // Initialize the dp data structure
878 /*memset( attr.f1, -1, sizeof( struct _dp ) * seCnt ) ;
879 for ( i = 0 ; i < seCnt ; ++i )
880 memset( attr.f2[i], -1, sizeof( struct _dp ) * seCnt ) ;
881 memset( attr.hash, -1, sizeof( struct _dp ) * HASH_MAX ) ;*/
882 for ( i = 0 ; i < seCnt ; ++i )
883 ResetDpContent( attr.f1[i] ) ;
884 for ( i = 0 ; i < seCnt && attr.f2 ; ++i )
885 for ( j = i ; j < seCnt ; ++j )
886 ResetDpContent( attr.f2[i][j] ) ;
887 for ( i = 0 ; i < hashMax ; ++i )
888 ResetDpContent( attr.hash[i] ) ;
889
890 // Set the uncovered pair
891 attr.uncoveredPair.clear() ;
892 BitTable bufferTable( seCnt ) ;
893 k = 0 ;
894 for ( i = 0 ; i < seCnt ; ++i )
895 {
896 for ( ; k < tcCnt ; ++k )
897 {
898 if ( tc[k].last >= i )
899 break ;
900 }
901
902 if ( k >= tcCnt || tc[k].first > i )
903 {
904 for ( j = 0 ; j < subexons[i].nextCnt ; ++j )
905 {
906 attr.uncoveredPair[i * seCnt + subexons[i].next[j] ] = 1 ;
907 }
908 continue ;
909 }
910
911 for ( j = 0 ; j < subexons[i].nextCnt ; ++j )
912 {
913 bool covered = false ;
914 int l, n ;
915
916 n = subexons[i].next[j] ;
917 for ( l = k ; l < tcCnt ; ++l )
918 {
919 if ( tc[l].first > i )
920 break ;
921 if ( tc[l].vector.Test( i ) && tc[l].vector.Test( n ) )
922 {
923 if ( n == i + 1 )
924 {
925 covered = true ;
926 break ;
927 }
928 else
929 {
930 bufferTable.Assign( tc[l].vector ) ;
931 bufferTable.MaskRegionOutside( i + 1, n - 1 ) ;
932 if ( bufferTable.IsAllZero() )
933 {
934 covered = true ;
935 break ;
936 }
937 }
938 }
939 }
940
941 if ( !covered )
942 {
943 //printf( "set!: (%d: %d %d) (%d: %d %d)\n", i, subexons[i].start, subexons[i].end, n, subexons[n].start, subexons[n].end ) ;
944 attr.uncoveredPair[ i * seCnt + n ] = 1 ;
945 }
946 }
947 }
948 bufferTable.Release() ;
949
950
951 // Find the max abundance
952 attr.forAbundance = true ;
953 attr.minAbundance = 0 ;
954 for ( i = 0 ; i < seCnt ; ++i )
955 {
956 if ( subexons[i].canBeStart )
957 {
958 int visit[1] = {i} ;
959 struct _dp tmp ;
960
961 tmp = SolveSubTranscript( visit, 1, 0, tc, 0, attr ) ;
962
963 if ( tmp.cover > maxAbundance )
964 maxAbundance = tmp.cover ;
965 }
966 }
967 //PrintLog( "maxAbundance=%lf", maxAbundance ) ;
968 //exit( 1 ) ;
969
970 // Pick the transcripts. Quantative Set-Cover
971 // Notice that by the logic in SearchSubTxpt and SolveSubTxpt, we don't need to reinitialize the data structure.
972 attr.forAbundance = false ;
973 int *coveredTc = new int[tcCnt] ;
974 int coveredTcCnt ;
975 struct _dp maxCoverDp ;
976 struct _dp bestDp ;
977 std::map<double, struct _dp> cachedCoverResult ;
978
979 maxCoverDp.seVector.Init( seCnt ) ;
980 bestDp.seVector.Init( seCnt ) ;
981 int iterCnt = 0 ;
982
983 while ( 1 )
984 {
985 double bestScore ;
986
987 // iterately assign constraints
988 attr.minAbundance = 0 ;
989
990 // Find the best candidate transcript.
991 bestDp.cover = -1 ;
992 bestScore = -1 ;
993 while ( 1 )
994 {
995 // iterate the change of minAbundance
996 if ( cachedCoverResult.find( attr.minAbundance ) != cachedCoverResult.end() )
997 {
998 struct _dp tmp = cachedCoverResult[ attr.minAbundance ] ;
999 SetDpContent( maxCoverDp, tmp, attr ) ;
1000 }
1001 else
1002 {
1003 maxCoverDp.cover = -1 ;
1004 ++attr.timeStamp ;
1005 for ( i = 0 ; i < seCnt ; ++i )
1006 {
1007 if ( subexons[i].canBeStart == false )
1008 continue ;
1009 int visit[1] = {i} ;
1010 struct _dp tmp ;
1011 tmp = SolveSubTranscript( visit, 1, 0, tc, 0, attr ) ;
1012
1013 if ( tmp.cover > maxCoverDp.cover && tmp.cover > 0 )
1014 {
1015 SetDpContent( maxCoverDp, tmp, attr ) ;
1016 }
1017 //if ( subexons[i].start == 6870264 || subexons[i].start == 6872237 )
1018 // printf( "%d: %lf\n", i, tmp.cover ) ;
1019 }
1020
1021 if ( maxCoverDp.cover == -1 )
1022 break ;
1023 struct _dp ccr ;
1024 ccr.seVector.Init( seCnt ) ;
1025 SetDpContent( ccr, maxCoverDp, attr ) ;
1026 cachedCoverResult[ attr.minAbundance ] = ccr ;
1027 }
1028 // the abundance for the max cover txpt.
1029 double min = -1 ;
1030 struct _transcript &subTxpt = attr.bufferTxpt ;
1031 subTxpt.seVector.Assign( maxCoverDp.seVector ) ;
1032 subTxpt.first = maxCoverDp.first ;
1033 subTxpt.last = maxCoverDp.last ;
1034
1035 for ( i = 0 ; i < tcCnt ; ++i )
1036 {
1037 if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 )
1038 {
1039 if ( tc[i].normAbund < min || min == -1 )
1040 min = tc[i].normAbund ;
1041 }
1042 }
1043
1044 if ( attr.minAbundance == 0 )
1045 {
1046 std::vector<int> subexonIdx ;
1047 maxCoverDp.seVector.GetOnesIndices( subexonIdx ) ;
1048 int size = subexonIdx.size() ;
1049 for ( i = 0 ; i < size - 1 ; ++i )
1050 if ( attr.uncoveredPair.find( subexonIdx[i] * seCnt + subexonIdx[i + 1] ) != attr.uncoveredPair.end() )
1051 {
1052 min = 1e-6 ;
1053 break ;
1054 }
1055 }
1056
1057 double score = ComputeScore( maxCoverDp.cover, 1.0, min, maxAbundance, 0 ) ;
1058 if ( bestScore == -1 || score > bestScore )
1059 {
1060 bestScore = score ;
1061 SetDpContent( bestDp, maxCoverDp, attr ) ;
1062 }
1063 else if ( score < bestScore )
1064 {
1065 if ( ComputeScore( maxCoverDp.cover, 1.0, maxAbundance, maxAbundance, 0 ) < bestScore )
1066 break ;
1067 }
1068 //PrintLog( "normAbund=%lf maxCoverDp.cover=%lf score=%lf timeStamp=%d", min, maxCoverDp.cover, score, attr.timeStamp ) ;
1069 attr.minAbundance = min ;
1070 } // end of iteration for minAbundance.
1071
1072 if ( bestDp.cover == -1 )
1073 break ;
1074 // Assign the constraints.
1075 coveredTcCnt = 0 ;
1076 double update = -1 ;
1077 struct _transcript &subTxpt = attr.bufferTxpt ;
1078 subTxpt.seVector.Assign( bestDp.seVector ) ;
1079 subTxpt.first = bestDp.first ;
1080 subTxpt.last = bestDp.last ;
1081 subTxpt.partial = false ;
1082 for ( i = 0 ; i < tcCnt ; ++i )
1083 {
1084 if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 )
1085 {
1086 if ( tc[i].abundance > 0 &&
1087 ( tc[i].abundance < update || update == -1 ) )
1088 {
1089 update = tc[i].abundance ;
1090 }
1091 coveredTc[ coveredTcCnt ] = i ;
1092 ++coveredTcCnt ;
1093 }
1094 /*else
1095 {
1096 printf( "%d: ", i ) ;
1097 tc[i].vector.Print() ;
1098 if ( i == 127 )
1099 {
1100 printf( "begin debug:\n" ) ;
1101 IsConstraintInTranscriptDebug( subTxpt, tc[i] ) ;
1102 }
1103 }*/
1104 }
1105 update *= ( 1 + iterCnt / 50 ) ;//* ( 1 + iterCnt / 50 ) ;
1106
1107 //PrintLog( "%d: update=%lf %d %d. %d %d %d", iterCnt, update, coveredTcCnt, tcCnt,
1108 // bestDp.first, bestDp.last, subexons[ bestDp.first ].start ) ;
1109 //bestDp.seVector.Print() ;
1110
1111 struct _transcript nt ;
1112 nt.seVector.Duplicate( bestDp.seVector ) ;
1113 nt.first = bestDp.first ;
1114 nt.last = bestDp.last ;
1115 nt.partial = false ;
1116 nt.abundance = 0 ;
1117 for ( i = 0 ; i < coveredTcCnt ; ++i )
1118 {
1119 j = coveredTc[i] ;
1120 if ( tc[j].abundance > 0 )
1121 {
1122 double tmp = ( tc[j].abundance > update ? update : tc[j].abundance ) ;
1123 tc[j].abundance -= tmp ;
1124 double factor = 1 ;
1125
1126 nt.abundance += ( tc[j].support * update / tc[j].normAbund * factor ) ;
1127
1128 if ( tc[j].abundance <= 0 )
1129 {
1130 std::vector<double> removeKey ;
1131 for ( std::map<double, struct _dp>::iterator it = cachedCoverResult.begin() ; it != cachedCoverResult.end() ; ++it )
1132 {
1133 subTxpt.seVector.Assign( it->second.seVector ) ;
1134 subTxpt.first = it->second.first ;
1135 subTxpt.last = it->second.last ;
1136 subTxpt.partial = false ;
1137 if ( IsConstraintInTranscript( subTxpt, tc[j] ) == 1 )
1138 {
1139 it->second.seVector.Release() ;
1140 removeKey.push_back( it->first ) ;
1141 }
1142 }
1143 int size = removeKey.size() ;
1144 int l ;
1145 for ( l = 0 ; l < size ; ++l )
1146 cachedCoverResult.erase( removeKey[l] ) ;
1147 }
1148 }
1149
1150 if ( tc[j].abundance < 0 )
1151 tc[j].abundance = 0 ;
1152 }
1153
1154 transcripts.push_back( nt ) ;
1155 if ( transcripts.size() >= transcripts.capacity() && (int)transcripts.size() >= coalesceThreshold )
1156 {
1157 CoalesceSameTranscripts( transcripts ) ;
1158 if ( transcripts.size() >= transcripts.capacity() / 2 )
1159 coalesceThreshold *= 2 ;
1160 }
1161 ++iterCnt ;
1162
1163 if ( iterCnt >= iterBound )
1164 break ;
1165 }
1166 CoalesceSameTranscripts( transcripts ) ;
1167 int size = transcripts.size() ;
1168 // Compute the correlation score
1169 for ( i = 0 ; i < size ; ++i )
1170 {
1171 std::vector<int> subexonInd ;
1172 transcripts[i].seVector.GetOnesIndices( subexonInd ) ;
1173 double cor = 2.0 ;
1174 int s = subexonInd.size() ;
1175 for ( j = 0 ; j < s ; ++j )
1176 for ( k = j + 1 ; k < s ; ++k )
1177 {
1178 double tmp = correlation.Query( subexonInd[j], subexonInd[k] ) ;
1179 if ( tmp < cor )
1180 cor = tmp ;
1181 }
1182 if ( cor > 1 )
1183 cor = 0 ;
1184 transcripts[i].correlationScore = cor ;
1185 }
1186
1187 // store the result
1188 for ( i = 0 ; i < size ; ++i )
1189 alltranscripts.push_back( transcripts[i] ) ;
1190
1191 // Release the memory
1192 for ( std::map<double, struct _dp>::iterator it = cachedCoverResult.begin() ; it != cachedCoverResult.end() ; ++it )
1193 {
1194 it->second.seVector.Release() ;
1195 }
1196 attr.bufferTxpt.seVector.Release() ;
1197
1198 delete[] coveredTc ;
1199 maxCoverDp.seVector.Release() ;
1200 bestDp.seVector.Release() ;
1201 }
1202
1203
1204 // Add the preifx/suffix of transcripts to the list
1205 void TranscriptDecider::AugmentTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, int limit, bool extend )
1206 {
1207 int i, j, k ;
1208 int size = alltranscripts.size() ;
1209 if ( size >= limit )
1210 return ;
1211
1212 // Augment suffix, prefix transcripts
1213 for ( i = 0 ; i < size ; ++i )
1214 {
1215 std::vector<int> subexonIdx ;
1216 alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ;
1217 int seIdxCnt = subexonIdx.size() ;
1218 // suffix
1219 for ( j = 1 ; j < seIdxCnt ; ++j )
1220 {
1221 if ( subexons[ subexonIdx[j] ].canBeStart )
1222 {
1223 struct _transcript nt ;
1224 nt.first = subexonIdx[j] ;
1225 nt.last = alltranscripts[i].last ;
1226 nt.seVector.Duplicate( alltranscripts[i].seVector ) ;
1227 nt.seVector.MaskRegionOutside( nt.first, nt.last ) ;
1228 nt.partial = false ;
1229 nt.correlationScore = 0 ;
1230 nt.abundance = 0 ;
1231 nt.constraintsSupport = NULL ;
1232
1233 alltranscripts.push_back( nt ) ;
1234 if ( alltranscripts.size() >= limit )
1235 return ;
1236 }
1237 }
1238
1239 // prefix
1240 for ( j = 0 ; j < seIdxCnt - 1 ; ++j )
1241 {
1242 if ( subexons[ subexonIdx[j] ].canBeEnd )
1243 {
1244 struct _transcript nt ;
1245 nt.first = alltranscripts[i].first ;
1246 nt.last = subexonIdx[j] ;
1247 nt.seVector.Duplicate( alltranscripts[i].seVector ) ;
1248 nt.seVector.MaskRegionOutside( nt.first, nt.last ) ;
1249 nt.partial = false ;
1250 nt.correlationScore = 0 ;
1251 nt.abundance = 0 ;
1252 nt.constraintsSupport = NULL ;
1253
1254 alltranscripts.push_back( nt ) ;
1255 if ( alltranscripts.size() >= limit )
1256 return ;
1257 }
1258 }
1259
1260 if ( extend )
1261 {
1262 //Extentions right.
1263 for ( j = 0 ; j < seIdxCnt ; ++j )
1264 {
1265 if ( subexons[ subexonIdx[j] ].nextCnt > 1 )
1266 {
1267 for ( k = 0 ; k < subexons[ subexonIdx[j] ].nextCnt ; ++k )
1268 {
1269 int idx = subexons[ subexonIdx[j] ].next[k] ;
1270
1271 if ( alltranscripts[i].seVector.Test( idx ) )
1272 continue ;
1273 int l ;
1274 std::vector<int> visited ;
1275 while ( 1 )
1276 {
1277 if ( subexons[idx].nextCnt > 1 || subexons[idx].prevCnt > 1 )
1278 {
1279 break ;
1280 }
1281
1282 visited.push_back( idx ) ;
1283 if ( subexons[idx].canBeEnd && subexons[idx].nextCnt == 0 )
1284 {
1285 struct _transcript nt ;
1286 nt.first = alltranscripts[i].first ;
1287 nt.last = idx ;
1288 nt.seVector.Duplicate( alltranscripts[i].seVector ) ;
1289 nt.seVector.MaskRegionOutside( nt.first, subexonIdx[j] ) ;
1290 int visitedSize = visited.size() ;
1291 for ( l = 0 ; l < visitedSize ; ++l )
1292 nt.seVector.Set( visited[l] ) ;
1293 nt.partial = false ;
1294 nt.correlationScore = 0 ;
1295 nt.abundance = 0 ;
1296 nt.constraintsSupport = NULL ;
1297
1298 alltranscripts.push_back( nt ) ;
1299 if ( alltranscripts.size() >= limit )
1300 return ;
1301 }
1302
1303 if ( subexons[idx].nextCnt == 1 )
1304 idx = subexons[idx].next[0] ;
1305 else
1306 break ;
1307 }
1308 }
1309 }
1310 }
1311
1312 // Extension towards left
1313 for ( j = 0 ; j < seIdxCnt ; ++j )
1314 {
1315 if ( subexons[ subexonIdx[j] ].prevCnt > 1 )
1316 {
1317 for ( k = 0 ; k < subexons[ subexonIdx[j] ].prevCnt ; ++k )
1318 {
1319 int idx = subexons[ subexonIdx[j] ].prev[k] ;
1320
1321 if ( alltranscripts[i].seVector.Test( idx ) )
1322 continue ;
1323 int l ;
1324 std::vector<int> visited ;
1325 while ( 1 )
1326 {
1327 if ( subexons[idx].nextCnt > 1 || subexons[idx].prevCnt > 1 )
1328 {
1329 break ;
1330 }
1331
1332 visited.push_back( idx ) ;
1333 if ( subexons[idx].canBeStart && subexons[idx].prevCnt == 0 )
1334 {
1335 struct _transcript nt ;
1336 nt.first = idx ;
1337 nt.last = alltranscripts[i].last ;
1338 nt.seVector.Duplicate( alltranscripts[i].seVector ) ;
1339 nt.seVector.MaskRegionOutside( subexonIdx[j], nt.last ) ;
1340 int visitedSize = visited.size() ;
1341 for ( l = 0 ; l < visitedSize ; ++l )
1342 nt.seVector.Set( visited[l] ) ;
1343 nt.partial = false ;
1344 nt.correlationScore = 0 ;
1345 nt.abundance = 0 ;
1346 nt.constraintsSupport = NULL ;
1347
1348 alltranscripts.push_back( nt ) ;
1349 if ( alltranscripts.size() >= limit )
1350 return ;
1351 }
1352
1353 if ( subexons[idx].prevCnt == 1 )
1354 idx = subexons[idx].prev[0] ;
1355 else
1356 break ;
1357 }
1358 }
1359 }
1360 }
1361 } // for if-extend
1362 }
1363
1364 CoalesceSameTranscripts( alltranscripts ) ;
1365 }
1366
1367 // Pick the transcripts from given transcripts.
1368 void TranscriptDecider::PickTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, Constraints &constraints,
1369 SubexonCorrelation &seCorrelation, std::vector<struct _transcript> &transcripts )
1370 {
1371 int i, j, k ;
1372 std::vector<int> chosen ;
1373 std::vector<struct _matePairConstraint> &tc = constraints.matePairs ;
1374 int atcnt = alltranscripts.size() ;
1375 int tcCnt = tc.size() ; // transcript constraints
1376 int seCnt = 0 ;
1377
1378 if ( tcCnt == 0 )
1379 return ;
1380 if ( atcnt > 0 )
1381 seCnt = alltranscripts[0].seVector.GetSize() ;
1382 else
1383 return ;
1384
1385 double inf = -1 ; // infinity
1386 int coalesceThreshold = 1024 ;
1387 int *transcriptSeCnt = new int[ atcnt ] ;
1388 int *transcriptLength = new int[atcnt] ;
1389 double *transcriptAbundance = new double[atcnt] ; // the roughly estimated abundance based on constraints.
1390 double *avgTranscriptAbundance = new double[atcnt] ; // the average normAbund from the compatible constraints.
1391
1392 BitTable *btable = new BitTable[ atcnt ] ;
1393 //BitTable lowCovSubexon ; // force the abundance to 0 for the transcript contains the subexon.
1394 double *coveredPortion = new double[atcnt] ;
1395
1396 memset( avgTranscriptAbundance, 0 ,sizeof( double ) * atcnt ) ;
1397 for ( i = 0 ; i < atcnt ; ++i )
1398 btable[i].Init( tcCnt ) ;
1399 for ( j = 0 ; j < tcCnt ; ++j )
1400 {
1401 int a = constraints.matePairs[j].i ;
1402 int b = constraints.matePairs[j].j ;
1403
1404 if ( constraints.constraints[a].support > inf )
1405 inf = constraints.constraints[a].support ;
1406 if ( constraints.constraints[b].support > inf )
1407 inf = constraints.constraints[b].support ;
1408
1409 if ( tc[j].normAbund > inf )
1410 inf = tc[j].normAbund ;
1411
1412 tc[j].abundance = tc[j].normAbund ;
1413 }
1414 ++inf ;
1415 bool btableSet = false ;
1416 for ( i = 0 ; i < atcnt ; ++i )
1417 {
1418 //printf( "correlation %d: %lf\n", i, alltranscripts[i].correlationScore ) ;
1419 /*for ( int l = 0 ; l < subexonInd.size() ; ++l )
1420 {
1421 for ( int m = l ; m < subexonInd.size() ; ++m )
1422 printf( "%lf ", seCorrelation.Query( l, m ) ) ;
1423 printf( "\n" ) ;
1424 }*/
1425
1426 for ( j = 0 ; j < tcCnt ; ++j )
1427 {
1428 int a = tc[j].i ;
1429 int b = tc[j].j ;
1430
1431 //printf( "try set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ;
1432 //alltranscripts[i].seVector.Print() ;
1433 //constraints.constraints[a].vector.Print() ;
1434 //constraints.constraints[b].vector.Print() ;
1435 if ( IsConstraintInTranscript( alltranscripts[i], constraints.constraints[a] ) == 1
1436 && IsConstraintInTranscript( alltranscripts[i], constraints.constraints[b] ) == 1 )
1437 {
1438 //printf( "set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ;
1439 btable[i].Set( j ) ;
1440 btableSet = true ;
1441 }
1442 }
1443 transcriptSeCnt[i] = alltranscripts[i].seVector.Count() ;
1444 }
1445 if ( btableSet == false )
1446 {
1447 for ( i = 0 ; i < atcnt ; ++i )
1448 btable[i].Release() ;
1449 delete[] btable ;
1450 return ;
1451 }
1452
1453 double maxAbundance = -1 ; // The abundance of the most-abundant transcript
1454 double *adjustScore = new double[atcnt] ;
1455 memset( adjustScore, 0, sizeof( double ) * atcnt ) ;
1456 if ( atcnt > 0 /*&& alltranscripts[0].abundance == -1*/ )
1457 {
1458 struct _pair32 *chain = new struct _pair32[seCnt] ;
1459 bool *covered = new bool[seCnt] ;
1460 bool *usedConstraints = new bool[constraints.constraints.size() ] ;
1461 std::vector<BitTable> togetherChain ; // those subexons is more likely to show up in the same transcript, like an IR with overhang, should be together to represent a 3'/5'-end
1462
1463 /*lowCovSubexon.Init( seCnt ) ;
1464 double *avgDepth = new double[seCnt ] ;
1465
1466 memset( avgDepth, 0, sizeof( double ) * seCnt ) ;
1467 int size = constraints.constraints.size() ;
1468 for ( i = 0 ; i < size ; ++i )
1469 {
1470 std::vector<int> subexonIdx ;
1471 constraints.constraints[i].GetOnesIndices( subexonIdx ) ;
1472 int seIdxCnt = subexonidx.size() ;
1473 for ( j = 0 ; j < seIdxCnt ; ++j )
1474 avgDepth[ subexonidx[j] ] += constraints.constraints[i].support ;
1475 }
1476 for ( i = 0 ; i < seCnt ; ++i )
1477 {
1478 if ( avgDepth[i] * alignments.readLen / (double)( subexons[i].end - subexons[i].start + 1 ) < 1 )
1479 }*/
1480
1481 struct _pair32 firstRegion, lastRegion ;
1482
1483 for ( i = 0 ; i < seCnt ; )
1484 {
1485 for ( j = i + 1 ; j < seCnt ; ++j )
1486 {
1487 if ( subexons[j].start > subexons[j - 1].end + 1 )
1488 break ;
1489 }
1490
1491
1492 int cnt = 0 ;
1493 for ( k = i ; k < j ; ++k )
1494 {
1495 if ( ( subexons[k].leftType == 2 && subexons[k].rightType == 1 )
1496 || ( subexons[k].leftType == 0 && subexons[k].rightType == 1 )
1497 || ( subexons[k].leftType == 2 && subexons[k].rightType == 0 ) )
1498 ++cnt ;
1499 }
1500
1501 if ( cnt <= 1 )
1502 {
1503 i = j ;
1504 continue ;
1505 }
1506
1507 BitTable tmpTable( seCnt ) ;
1508 for ( k = i ; k < j ; ++k )
1509 {
1510 if ( ( subexons[k].leftType == 2 && subexons[k].rightType == 1 )
1511 || ( subexons[k].leftType == 0 && subexons[k].rightType == 1 )
1512 || ( subexons[k].leftType == 2 && subexons[k].rightType == 0 ) )
1513 tmpTable.Set( k ) ;
1514 }
1515 togetherChain.push_back( tmpTable ) ;
1516 i = j ;
1517 }
1518
1519 for ( i = 0 ; i < atcnt ; ++i )
1520 {
1521 double value = inf ;
1522 int tag = -1 ;
1523
1524 alltranscripts[i].abundance = 0 ;
1525 alltranscripts[i].constraintsSupport = new double[tcCnt] ;
1526
1527 std::vector<int> subexonIdx ;
1528 alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ;
1529 int seIdxCnt = subexonIdx.size() ;
1530 transcriptLength[i] = 0 ;
1531
1532 firstRegion.a = subexonIdx[0] ;
1533 for ( j = 1 ; j < seIdxCnt ; ++j )
1534 {
1535 if ( subexons[ subexonIdx[j] ].start > subexons[ subexonIdx[j - 1] ].end + 1 )
1536 break ;
1537 }
1538 firstRegion.b = subexonIdx[j - 1] ;
1539 lastRegion.b = subexonIdx[ seIdxCnt - 1 ] ;
1540 for ( j = seIdxCnt - 2 ; j >= 0 ; --j )
1541 {
1542 if ( subexons[ subexonIdx[j] ].end < subexons[ subexonIdx[j + 1] ].start - 1 )
1543 break ;
1544 }
1545 lastRegion.a = subexonIdx[j + 1] ;
1546
1547 for ( j = 0 ; j < seIdxCnt ; ++j )
1548 transcriptLength[i] += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ;
1549
1550 //for ( j = firstRegion.b ; j < lastRegion.a ; ++j )
1551 for ( j = 0 ; j < seIdxCnt - 1 ; ++j )
1552 {
1553 chain[j].a = subexonIdx[j] ;
1554 chain[j].b = subexonIdx[j + 1] ;
1555 covered[j] = false ;
1556 }
1557 memset( usedConstraints, false, sizeof( bool ) * constraints.constraints.size() ) ;
1558 int compatibleCnt = 0 ;
1559 for ( j = 0 ; j < tcCnt ; ++j )
1560 {
1561 alltranscripts[i].constraintsSupport[j] = 0 ;
1562 if ( btable[i].Test(j) && tc[j].abundance > 0 )
1563 {
1564 ++compatibleCnt ;
1565 double adjustAbundance = tc[j].abundance ;
1566 if ( seIdxCnt > 1 )
1567 {
1568 if ( tc[j].i == tc[j].j
1569 && ( constraints.constraints[ tc[j].i ].first +
1570 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].first
1571 || constraints.constraints[ tc[j].i ].first +
1572 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].last ) )
1573 {
1574 adjustAbundance = inf ;
1575 }
1576 else if ( tc[j].i != tc[j].j
1577 && ( constraints.constraints[ tc[j].i ].first +
1578 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].first
1579 || constraints.constraints[ tc[j].i ].first +
1580 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].last ) )
1581 {
1582 adjustAbundance = constraints.constraints[ tc[j].j ].normAbund ;
1583 }
1584 else if ( tc[j].i != tc[j].j
1585 && ( constraints.constraints[ tc[j].j ].first +
1586 constraints.constraints[ tc[j].j ].last == 2 * alltranscripts[i].first
1587 || constraints.constraints[ tc[j].j ].first +
1588 constraints.constraints[ tc[j].j ].last == 2 * alltranscripts[i].last ) )
1589 {
1590 adjustAbundance = constraints.constraints[ tc[j].i ].normAbund ;
1591 }
1592 }
1593 if ( adjustAbundance < value )
1594 /*!( seIdxCnt > 1
1595 && ( ( ( constraints.constraints[ tc[j].i ].first >= firstRegion.a && constraints.constraints[ tc[j].i ].last <= firstRegion.b )
1596 && ( constraints.constraints[ tc[j].j ].first >= firstRegion.a && constraints.constraints[ tc[j].j ].last <= firstRegion.b ) )
1597 || ( ( constraints.constraints[ tc[j].i ].first >= lastRegion.a && constraints.constraints[ tc[j].i ].last <= lastRegion.b )
1598 && ( constraints.constraints[ tc[j].j ].first >= lastRegion.a && constraints.constraints[ tc[j].j ].last <= lastRegion.b ) ) ) )
1599 )*/
1600 {
1601 // Not use the constraints totally within the 3'/5'-end in the transcript
1602 value = adjustAbundance ;
1603 tag = j ;
1604 }
1605 avgTranscriptAbundance[i] += tc[j].abundance ;
1606
1607 if ( !usedConstraints[ tc[j].i ] )
1608 {
1609 struct _constraint &c = constraints.constraints[ tc[j].i ] ;
1610 for ( k = 0 ; k < seIdxCnt - 1 ; ++k )
1611 {
1612 // Note that since the constraint is already compatible with the txpt,
1613 // chain[k].a/b must be also adjacent in this constraint.
1614 if ( c.vector.Test( chain[k].a ) && c.vector.Test( chain[k].b ) )
1615 covered[k] = true ;
1616 }
1617 usedConstraints[ tc[j].i ] = true ;
1618 }
1619
1620 if ( !usedConstraints[ tc[j].j ] )
1621 {
1622 struct _constraint &c = constraints.constraints[ tc[j].j ] ;
1623 for ( k = 0 ; k < seIdxCnt - 1 ; ++k )
1624 {
1625 if ( c.vector.Test( chain[k].a ) && c.vector.Test( chain[k].b ) )
1626 covered[k] = true ;
1627 }
1628 usedConstraints[ tc[j].j ] = true ;
1629 }
1630 }
1631 }
1632
1633 // Get some penalty if something should together did not show up together
1634 int size = togetherChain.size() ;
1635 if ( size > 0 )
1636 {
1637 BitTable bufferTable( seCnt ) ;
1638 for ( j = 0 ; j < size ; ++j )
1639 {
1640 bufferTable.Assign( togetherChain[j] ) ;
1641 bufferTable.And( alltranscripts[i].seVector ) ;
1642 //if ( !bufferTable.IsAllZero() && !bufferTable.IsEqual( togetherChain[j] ) )
1643 // value /= 2 ;
1644
1645 if ( !bufferTable.IsAllZero() )
1646 {
1647 if ( bufferTable.IsEqual( togetherChain[j] ) )
1648 //printf( "nice together!\n" ) ;
1649 ;
1650 else
1651 value /= 2 ;
1652 //printf( "bad together!\n" ) ;
1653 }
1654 }
1655 bufferTable.Release() ;
1656 }
1657
1658
1659 // Every two-subexon chain should be covered by some reads if a transcript is expressed highly enough
1660 int cnt = 0 ;
1661 for ( j = 0 ; j < seIdxCnt - 1 ; ++j )
1662 if ( covered[j] == false ) // && j >= firstRegion.b && j <= lastRegion.a - 1 )
1663 {
1664 value = 0 ;
1665 }
1666 else
1667 ++cnt ;
1668 if ( seIdxCnt > 1 )
1669 coveredPortion[i] = (double)cnt / (double)( seIdxCnt - 1 ) ;
1670 else
1671 coveredPortion[i] = 1 ;
1672 if ( coveredPortion[i] == 0 )
1673 coveredPortion[i] = (double)0.5 / ( seIdxCnt ) ;
1674
1675 // For short subexon (readLength-subexon_length-1>30), we further require a constraint cover three conseuctive subexon
1676 /*memset( usedConstraints, false, sizeof( bool ) * constraints.constraints.size() ) ;
1677 for ( j = 1 ; j < seIdxCnt - 1 ; ++j )
1678 {
1679 int k = subexonIdx[j] ;
1680 if ( alignments.readLen - ( subexons[k].end - subexons[k].start + 1 ) - 1 <= 30 )
1681 continue ;
1682 // We need at least one of the side subexons are adjacent to the center one.
1683 if ( subexons[ subexonIdx[j - 1] ].end + 1 < subexons[k].start && subexons[k].end + 1 < subexons[ subexonIdx[j + 1] ].start )
1684 continue ;
1685
1686 int l = 0 ;
1687 for ( l = 0 ; l < tcCnt ; ++l )
1688 {
1689 if ( btable[i].Test(l) && tc[l].abundance > 0 )
1690 {
1691 if ( !usedConstraints[ tc[l].i ] )
1692 {
1693 struct _constraint &c = constraints.constraints[ tc[l].i ] ;
1694 if ( c.vector.Test( subexonIdx[j - 1] ) && c.vector.Test( subexonIdx[j] ) &&
1695 c.vector.Test( subexonIdx[j + 1] ) )
1696 break ;
1697 usedConstraints[ tc[l].i ] = true ;
1698 }
1699
1700 if ( !usedConstraints[ tc[l].j ] )
1701 {
1702 struct _constraint &c = constraints.constraints[ tc[l].j ] ;
1703 if ( c.vector.Test( subexonIdx[j - 1] ) && c.vector.Test( subexonIdx[j] ) &&
1704 c.vector.Test( subexonIdx[j + 1] ) )
1705 break ;
1706 usedConstraints[ tc[l].j ] = true ;
1707 }
1708 }
1709 }
1710 // It is not covered
1711 if ( l >= tcCnt )
1712 {
1713 int residual = alignments.readLen - ( subexons[k].end - subexons[k].start + 1 ) - 1 ;
1714 //printf( "residual: %d %d %lf\n", k, residual, value ) ;
1715 if ( value * residual > 2 )
1716 {
1717 value = 1 / (double)residual ;
1718 }
1719 }
1720 }*/
1721
1722 if ( tag == -1 )
1723 value = 0 ;
1724 if ( value > maxAbundance )
1725 maxAbundance = value ;
1726 transcriptAbundance[i] = value ;
1727 if ( tag != -1 )
1728 avgTranscriptAbundance[i] /= compatibleCnt ;
1729
1730 //printf( "abundance %d: %lf %lf ", i, value, avgTranscriptAbundance[i] ) ;
1731 //alltranscripts[i].seVector.Print() ;
1732 }
1733 if ( maxAbundance == 0 )
1734 {
1735 for ( i = 0 ; i < atcnt ; ++i )
1736 {
1737 transcriptAbundance[i] = coveredPortion[i] ;
1738 }
1739 maxAbundance = 1 ;
1740 }
1741 //printf( "%s: %lf\n", __func__, maxAbundance ) ;
1742 int size = togetherChain.size() ;
1743 for ( j = 0 ; j < size ; ++j )
1744 togetherChain[j].Release() ;
1745 delete[] usedConstraints ;
1746 delete[] covered ;
1747 delete[] chain ;
1748 }
1749 else
1750 {
1751 for ( i = 0 ; i < atcnt ; ++i )
1752 {
1753 transcriptAbundance[i] = alltranscripts[i].abundance ;
1754 if ( transcriptAbundance[i] > maxAbundance )
1755 maxAbundance = transcriptAbundance[i] ;
1756 coveredPortion[i] = 1 ;
1757 }
1758 if ( maxAbundance == 0 )
1759 maxAbundance = 1 ;
1760 }
1761
1762 // Obtain the prefix, suffix information of the transcripts.
1763 int *nextSuffix, *nextPrefix ;
1764 struct _pair32 *txptRank ;
1765 nextSuffix = new int[atcnt] ;
1766 nextPrefix = new int[atcnt] ;
1767 txptRank = new struct _pair32[atcnt] ;
1768 memset( nextSuffix, -1, sizeof( int ) * atcnt ) ;
1769 memset( nextPrefix, -1, sizeof( int ) * atcnt ) ;
1770 /*for ( i = 0 ; i < atcnt ; ++i )
1771 {
1772 std::vector<int> subexonIdx ;
1773 txptRank[i].a = i ;
1774 alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ;
1775 txptRank[i].b = subexonIdx.size() ;
1776 }
1777 qsort( txptRank, atcnt, sizeof( struct _pair32 ), CompPairsByB) ;
1778 BitTable bufferTable( seCnt ) ;
1779 for ( i = atcnt - 1 ; i >= 0 ; --i )
1780 {
1781 int a = txptRank[i].a ;
1782 for ( j = i - 1 ; j >= 0 ; --j )
1783 {
1784 if ( txptRank[i].b == txptRank[j].b )
1785 continue ;
1786
1787 int b = txptRank[j].a ;
1788
1789 if ( alltranscripts[b].last != alltranscripts[a].last )
1790 continue ;
1791
1792 bufferTable.Assign( alltranscripts[a].seVector ) ;
1793 bufferTable.MaskRegionOutside( alltranscripts[b].first, alltranscripts[b].last ) ;
1794 if ( bufferTable.IsEqual( alltranscripts[b].seVector ) )
1795 {
1796 nextSuffix[a] = b ;
1797 break ;
1798 }
1799 }
1800 }
1801 for ( i = atcnt - 1 ; i >= 0 ; --i )
1802 {
1803 int a = txptRank[i].a ;
1804 for ( j = i - 1 ; j >= 0 ; --j )
1805 {
1806 if ( txptRank[i].b == txptRank[j].b )
1807 continue ;
1808
1809 int b = txptRank[j].a ;
1810
1811 if ( alltranscripts[b].first != alltranscripts[a].first )
1812 continue ;
1813
1814 bufferTable.Assign( alltranscripts[a].seVector ) ;
1815 bufferTable.MaskRegionOutside( alltranscripts[b].first, alltranscripts[b].last ) ;
1816 if ( bufferTable.IsEqual( alltranscripts[b].seVector ) )
1817 {
1818 nextPrefix[a] = b ;
1819 break ;
1820 }
1821 }
1822 }
1823
1824 bufferTable.Release() ;*/
1825 delete[] txptRank ;
1826
1827 // Quantative Set-Cover
1828 int iterCnt = -1 ;
1829 double *coverCnt = new double[atcnt] ;
1830 for ( i = 0 ; i < atcnt ; ++i )
1831 coverCnt[i] = -1 ;
1832 int *list = new int[atcnt] ;
1833 int listCnt ;
1834
1835 while ( 1 )
1836 {
1837 double max = -1 ;
1838 int maxtag = -1 ;
1839 double maxcnt = -1 ;
1840 ++iterCnt ;
1841
1842 // Find the optimal candidate.
1843 for ( i = 0 ; i < atcnt ; ++i )
1844 {
1845 double value = inf ;
1846 double cnt = 0 ;
1847
1848 if ( coverCnt[i] == -1 )
1849 {
1850 for ( j = 0 ; j < tcCnt ; ++j )
1851 {
1852 if ( tc[j].abundance > 0 && btable[i].Test( j ) )
1853 {
1854 cnt += tc[j].effectiveCount ;
1855 }
1856 }
1857 /*else
1858 {
1859 std::vector<int> tcIdx ;
1860 btable[i].GetOnesIndices( tcIdx ) ;
1861 int size = tcIdx.size() ;
1862 for ( j = 0 ; j < size ; ++j )
1863 {
1864 if ( tc[ tcIdx[j] ].abundance > 0 )
1865 {
1866 cnt += tc[ tcIdx[j] ].effectiveCount ;
1867 }
1868 }
1869 }*/
1870 coverCnt[i] = cnt ;
1871 }
1872 else
1873 {
1874 cnt = coverCnt[i] ;
1875 }
1876
1877 value = transcriptAbundance[i] ;
1878 if ( cnt < 1 ) // This transcript does not satisfy any undepleted constraints.
1879 continue ;
1880 cnt *= coveredPortion[i] ;
1881
1882 double weight = 1 ; //* seCnt / transcriptSeCnt[i] ;
1883 //if ( maxAbundance >= 1 && value / maxAbundance >= 0.2 )
1884 // seCntAdjust = sqrt( (double)( transcriptSeCnt[i] ) / seCnt ) ;//< 0.5 ? 0.5 : (double)( transcriptSeCnt[i] ) / seCnt ;
1885
1886 if ( alltranscripts[i].FPKM > 0 && sampleCnt > 1 )
1887 weight = ( 1 + alltranscripts[i].FPKM / sampleCnt ) ;
1888
1889 double score = ComputeScore( cnt, weight, value, maxAbundance, alltranscripts[i].correlationScore ) ;
1890 if ( cnt > maxcnt )
1891 maxcnt = cnt ;
1892 score += adjustScore[i] ;
1893 if ( score > max )
1894 {
1895 max = score ;
1896 maxtag = i ;
1897 }
1898 else if ( score == max )
1899 {
1900 if ( avgTranscriptAbundance[maxtag] < avgTranscriptAbundance[i] )
1901 {
1902 max = score ;
1903 maxtag = i ;
1904 }
1905 }
1906 //printf( "score: %d %lf -> %lf\n", i, cnt, score ) ;
1907 }
1908
1909 if ( maxcnt == 0 || maxtag == -1 )
1910 break ;
1911
1912 // Find the constraint that should be depleted.
1913 double update = inf ;
1914 int updateTag = 0 ;
1915 for ( j = 0 ; j < tcCnt ; ++j )
1916 {
1917 if ( btable[ maxtag ].Test( j ) && tc[j].abundance > 0 &&
1918 tc[j].abundance <= update )
1919 {
1920 update = tc[j].abundance ;
1921 updateTag = j ;
1922 }
1923 }
1924
1925 // Search suffix and prefix to see whether these fit better.
1926 int p = nextSuffix[ maxtag] ;
1927 while ( p != -1 )
1928 {
1929 if ( transcriptAbundance[p] >= 10.0 * transcriptAbundance[maxtag]
1930 && btable[p].Test( updateTag ) )
1931 {
1932 //printf( "%d\n", p ) ;
1933 maxtag = p ;
1934 break ;
1935 }
1936 p = nextSuffix[p] ;
1937 }
1938 p = nextPrefix[maxtag] ;
1939 while ( p != -1 )
1940 {
1941 if ( transcriptAbundance[p] >= 10.0 * transcriptAbundance[maxtag]
1942 && btable[p].Test( updateTag ) )
1943 {
1944 maxtag = p ;
1945 break ;
1946 }
1947 p = nextPrefix[p] ;
1948 }
1949
1950
1951 // Update the abundance.
1952 int supportCnt = 0 ;
1953 for ( j = 0 ; j < tcCnt ; ++j )
1954 {
1955 if ( btable[maxtag].Test( j ) )
1956 {
1957 if ( tc[j].abundance > 0 )
1958 {
1959 tc[j].abundance -= 1 * update ;
1960 double factor = tc[j].effectiveCount ;
1961 double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) ;
1962 alltranscripts[maxtag].constraintsSupport[j] += tmp ;
1963 alltranscripts[maxtag].abundance += tmp ;
1964
1965 if ( tc[j].abundance <= 0 )
1966 {
1967 int l ;
1968 for ( l = 0 ; l < atcnt ; ++l )
1969 {
1970 if ( btable[l].Test(j) )
1971 coverCnt[l] -= tc[j].effectiveCount ;
1972 }
1973 }
1974 ++supportCnt ;
1975 }
1976 else if ( alltranscripts[maxtag].constraintsSupport[j] == 0 )
1977 {
1978 double sum = 0 ;
1979 double takeOut = 0 ;
1980 double factor = tc[j].effectiveCount ;
1981 listCnt = 0 ;
1982 for ( i = 0 ; i < atcnt ; ++i )
1983 {
1984 if ( i == maxtag )
1985 continue ;
1986
1987 if ( alltranscripts[i].abundance > 0 && btable[i].Test(j) )
1988 {
1989 sum += alltranscripts[i].constraintsSupport[j] ;
1990
1991 double tmp = ( alltranscripts[i].constraintsSupport[j] + alltranscripts[maxtag].constraintsSupport[j] ) *
1992 transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[i] )
1993 - alltranscripts[maxtag].constraintsSupport[j] ;
1994 if ( tmp > 0 )
1995 {
1996 list[ listCnt ] = i ;
1997 ++listCnt ;
1998 takeOut += tmp ; //alltranscripts[i].constraintsSupport[j] * transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[i] ) ;
1999 }
2000 }
2001 }
2002
2003 double ratio = 1 ;
2004 double takeOutFactor = 0.5 ;
2005 if ( update < tc[j].normAbund )
2006 {
2007 if ( takeOut > ( tc[j].support * update / tc[j].normAbund * factor ) * takeOutFactor )
2008 ratio = ( tc[j].support * update / tc[j].normAbund * factor ) * takeOutFactor / takeOut ;
2009 }
2010 else
2011 {
2012 if ( takeOut > ( tc[j].support * factor ) * takeOutFactor )
2013 ratio = ( tc[j].support * factor ) * takeOutFactor / takeOut ;
2014 }
2015
2016 if ( 1 ) //update < tc[j].normAbund )
2017 {
2018 for ( i = 0 ; i < listCnt ; ++i )
2019 {
2020 //double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) *
2021 // ( alltranscripts[ list[i] ].constraintsSupport[j] / sum ) ;
2022 //if ( alltranscripts[ list[i] ].constraintsSupport[j] < tmp )
2023 // printf( "WARNING! %lf %lf, %lf\n", alltranscripts[ list[i] ].constraintsSupport[j], sum, tmp ) ;
2024
2025 //double tmp = alltranscripts[ list[i] ].constraintsSupport[j] * transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[ list[i] ] ) * ratio ;
2026 double tmp = ( ( alltranscripts[ list[i] ].constraintsSupport[j] + alltranscripts[maxtag].constraintsSupport[j] ) *
2027 transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[ list[i] ] )
2028 - alltranscripts[maxtag].constraintsSupport[j] ) * ratio ;
2029
2030
2031 alltranscripts[ list[i] ].constraintsSupport[j] -= tmp ;
2032 alltranscripts[ list[i] ].abundance -= tmp ;
2033 }
2034 //double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) ;
2035 //printf( "%lf %lf. %lf %lf\n", takeOut, ratio, update, tc[j].normAbund ) ;
2036 double tmp = takeOut * ratio ;
2037 alltranscripts[maxtag].constraintsSupport[j] += tmp ;
2038 alltranscripts[maxtag].abundance += tmp ;
2039 }
2040 /*else
2041 {
2042 double tmp = ( tc[j].support / (double)( listCnt + 1 ) ) * factor ;
2043 for ( i = 0 ; i < listCnt ; ++i )
2044 {
2045 alltranscripts[ list[i] ].abundance -= alltranscripts[ list[i] ].constraintsSupport[j] ;
2046
2047 alltranscripts[ list[i] ].constraintsSupport[j] = tmp ;
2048 alltranscripts[ list[i] ].abundance += tmp ;
2049 }
2050 alltranscripts[maxtag].constraintsSupport[j] += tmp ;
2051 alltranscripts[maxtag].abundance += tmp ;
2052 }*/
2053
2054 }
2055 }
2056
2057 if ( tc[j].abundance < 0 )
2058 {
2059 tc[j].abundance = 0 ;
2060
2061 }
2062 }
2063 tc[ updateTag ].abundance = 0 ;
2064 if ( supportCnt == 0 )
2065 break ;
2066 //adjustScore[maxtag] += 1 / (double)tcCnt ;
2067 //printf( "maxtag=%d %lf %d\n", maxtag, update, updateTag ) ;
2068 }
2069
2070 for ( i = 0 ; i < atcnt ; ++i )
2071 {
2072 if ( alltranscripts[i].abundance > 0 )
2073 {
2074 struct _transcript nt = alltranscripts[i] ;
2075 nt.seVector.Nullify() ;
2076 nt.seVector.Duplicate( alltranscripts[i].seVector ) ;
2077 nt.constraintsSupport = NULL ;
2078 if ( transcriptAbundance[i] == 0 )
2079 nt.correlationScore = -1 ;
2080 else
2081 nt.correlationScore = 0 ;
2082 nt.id = i ;
2083 transcripts.push_back( nt ) ;
2084 }
2085 }
2086
2087 // Release the memory of btable.
2088 for ( i = 0 ; i < atcnt ; ++i )
2089 {
2090 delete[] alltranscripts[i].constraintsSupport ;
2091 btable[i].Release() ;
2092 }
2093 delete[] btable ;
2094
2095 delete[] list ;
2096 delete[] transcriptSeCnt ;
2097 delete[] transcriptLength ;
2098 delete[] transcriptAbundance ;
2099 delete[] avgTranscriptAbundance ;
2100 delete[] coveredPortion ;
2101 delete[] adjustScore ;
2102 delete[] coverCnt ;
2103
2104 delete[] nextPrefix ;
2105 delete[] nextSuffix ;
2106
2107 // Redistribute weight if there is some constraints that are unbalanced.
2108 /*tcnt = transcripts.size() ;
2109 for ( i = 0 ; i < tcnt ; ++i )
2110 {
2111 int maxRatio = -1 ;
2112 for ( j = 0 ; j < tcCnt ; ++j )
2113 if ( transcripts[i].constraintsSupport[j] > 0 )
2114 {
2115 double factor = tc[j].effectiveCount ;
2116 if ( transcripts[])
2117 }
2118 }*/
2119 }
2120
2121 void TranscriptDecider::AbundanceEstimation( struct _subexon *subexons, int seCnt, Constraints &constraints, std::vector<struct _transcript> &transcripts )
2122 {
2123 int tcnt = transcripts.size() ;
2124 int size ;
2125 int i, j ;
2126 if ( tcnt <= 0 )
2127 return ;
2128
2129 std::vector<struct _matePairConstraint> &tc = constraints.matePairs ;
2130 int tcCnt = tc.size() ; // transcript constraints
2131
2132 BitTable *btable = new BitTable[ tcnt ] ;
2133 int *transcriptLength = new int[tcnt] ;
2134 int *compatibleList = new int[tcnt] ;
2135 double *rho = new double[tcnt] ; // the abundance.
2136 int iterCnt = 0 ;
2137
2138 for ( i = 0 ; i < tcnt ; ++i )
2139 transcripts[i].constraintsSupport = new double[ tcCnt ] ;
2140
2141 for ( i = 0 ; i < tcnt ; ++i )
2142 {
2143 btable[i].Init( tcCnt ) ;
2144 double min = -1 ;
2145 for ( j = 0 ; j < tcCnt ; ++j )
2146 {
2147 int a = tc[j].i ;
2148 int b = tc[j].j ;
2149
2150 if ( IsConstraintInTranscript( transcripts[i], constraints.constraints[a] ) == 1
2151 && IsConstraintInTranscript( transcripts[i], constraints.constraints[b] ) == 1 )
2152 {
2153 //printf( "set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ;
2154 btable[i].Set( j ) ;
2155
2156 if ( min == -1 || tc[j].normAbund < min )
2157 min = tc[j].normAbund ;
2158 }
2159 }
2160
2161 std::vector<int> subexonIdx ;
2162 transcripts[i].seVector.GetOnesIndices( subexonIdx ) ;
2163 int subexonIdxCnt = subexonIdx.size() ;
2164 int len = 0 ;
2165 for ( j = 0 ; j < subexonIdxCnt ; ++j )
2166 len += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ;
2167 transcriptLength[i] = len - alignments.fragLen + 2 * alignments.fragStdev ;
2168 if ( transcriptLength[i] < 1 )
2169 transcriptLength[i] = 1 ;
2170 rho[i] = transcripts[i].abundance / transcriptLength[i] ; // use the rough estimation generated before.
2171 if ( transcripts[i].correlationScore == -1 && rho[i] > 0.1 / (double)alignments.readLen )
2172 rho[i] = 0.1 / (double)alignments.readLen ;
2173 }
2174
2175 while ( 1 )
2176 {
2177 for ( i = 0 ; i < tcnt ; ++i )
2178 for ( j = 0 ; j < tcCnt ; ++j )
2179 {
2180 transcripts[i].constraintsSupport[j] = 0 ;
2181 }
2182 for ( j = 0 ; j < tcCnt ; ++j )
2183 {
2184 int clCnt = 0 ;
2185 double sum = 0 ;
2186 for ( i = 0 ; i < tcnt ; ++i )
2187 {
2188 if ( btable[i].Test(j) )
2189 {
2190 compatibleList[ clCnt ] = i ;
2191 ++clCnt ;
2192 sum += rho[i] ;
2193 }
2194 }
2195
2196 for ( i = 0 ; i < clCnt ; ++i )
2197 {
2198 double factor = tc[j].effectiveCount ;
2199 transcripts[ compatibleList[i] ].constraintsSupport[j] = ( rho[ compatibleList[i] ] / sum ) * tc[j].support * factor ;
2200 }
2201 }
2202
2203 double diff = 0 ;
2204 for ( i = 0 ; i < tcnt ; ++i )
2205 {
2206 double newAbund = 0 ;
2207 for ( j = 0 ; j < tcCnt ; ++j )
2208 newAbund += transcripts[i].constraintsSupport[j] ;
2209 double old = rho[i] ;
2210 rho[i] = newAbund / transcriptLength[i] ;
2211 //printf( "rho[%d]=%lf\n", i, rho[i] ) ;
2212 if ( transcripts[i].correlationScore == -1 && rho[i] > 0.1 / (double)alignments.readLen )
2213 rho[i] = 0.1 / (double)alignments.readLen ;
2214
2215 double tmp = ( old - rho[i] ) ;
2216 diff += tmp < 0 ? -tmp : tmp ;
2217 }
2218 //printf( "%lf\n", diff ) ;
2219 if ( diff < 1e-3)
2220 break ;
2221
2222 ++iterCnt ;
2223 if ( iterCnt >= 1000 )
2224 break ;
2225 }
2226
2227 for ( i = 0 ; i < tcnt ; ++i )
2228 {
2229 //printf( "%lf=>", transcripts[i].abundance ) ;
2230 transcripts[i].abundance = 0 ;
2231 for ( j = 0 ; j < tcCnt ; ++j )
2232 {
2233 transcripts[i].abundance += transcripts[i].constraintsSupport[j] ;
2234 }
2235 //printf( "%lf. (%lf)\n", transcripts[i].abundance, transcripts[i].correlationScore ) ;
2236 //transcripts[i].seVector.Print() ;
2237 }
2238
2239 for ( i = 0 ; i < tcnt ; ++i )
2240 delete[] transcripts[i].constraintsSupport ;
2241
2242 // Release the memory of btable.
2243 for ( i = 0 ; i < tcnt ; ++i )
2244 {
2245 btable[i].Release() ;
2246 }
2247 delete[] compatibleList ;
2248 delete[] btable ;
2249 delete[] transcriptLength ;
2250 delete[] rho ;
2251 }
2252
// Filter and polish the set of candidate transcripts with a cascade of
// heuristics:
//   (-f) drop transcripts whose FPKM is a small fraction of their gene's max,
//   (-d) drop transcripts with too little read coverage,
//   drop transcripts shorter than 200bp,
//   rescue transcripts carrying subexon chains supported by many samples,
//   drop transcripts whose constraint support is almost entirely non-unique,
//   and finally drop "shadow" single-intron transcripts.
// A transcript is marked for removal by negating its abundance; the
// RemoveNegativeAbundTranscripts() passes perform the actual deletion.
// Returns the number of transcripts kept.
int TranscriptDecider::RefineTranscripts( struct _subexon *subexons, int seCnt, bool aggressive,
		std::map<int, int> *subexonChainSupport, int *txptSampleSupport, std::vector<struct _transcript> &transcripts, Constraints &constraints )
{
	int i, j, k ;
	int tcnt = transcripts.size() ;
	if ( tcnt == 0 )
		return 0 ;
	int tcCnt = constraints.matePairs.size() ;

	std::vector<struct _matePairConstraint> &tc = constraints.matePairs ;
	std::vector<struct _constraint> &scc = constraints.constraints ; //single-end constraints.constraints

	// Remove transcripts whose FPKM are too small.
	//printf( "%d %d\n", usedGeneId, baseGeneId ) ;
	// Per-gene statistics, all indexed by (geneId - baseGeneId).
	double *geneMaxFPKM = new double[usedGeneId - baseGeneId ] ;
	int *geneMaxFPKMTag = new int[usedGeneId - baseGeneId ] ;
	double *nonOverlapMaxFPKM = new double[ usedGeneId - baseGeneId ] ; // the max FPKM among all the transcripts not overlapping with maxFPKMTag transcripts.
	memset( geneMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ;
	memset( geneMaxFPKMTag, 0, sizeof( int ) * ( usedGeneId - baseGeneId ) ) ;
	memset( nonOverlapMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ;

	double *geneMaxCov = new double[ usedGeneId - baseGeneId ] ;
	memset( geneMaxCov, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ;
	int *txptGid = new int[tcnt] ; // gene id of each transcript.

	/*for ( i = 0 ; i < tcnt ; ++i )
	{
		printf( "%d: %lf ", i, transcripts[i].FPKM ) ;
		transcripts[i].seVector.Print() ;
	}*/

	/*==================================================================
	Remove transcripts that has too few relative FPKM. (-f)
	====================================================================*/
	// First pass: collect per-gene max FPKM / max coverage.
	for ( i = 0 ; i < tcnt ; ++i )
	{
		int gid = GetTranscriptGeneId( transcripts[i], subexons ) ;
		int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ;
		//printf( "gid=%d\n", gid ) ;
		//printf( "%lf %lf %d\n", transcripts[i].abundance, transcripts[i].FPKM, len ) ;
		if ( transcripts[i].FPKM > geneMaxFPKM[gid - baseGeneId ] )
		{
			geneMaxFPKM[ gid - baseGeneId ] = transcripts[i].FPKM ;
			geneMaxFPKMTag[ gid - baseGeneId ] = i ;
		}
		if ( transcripts[i].abundance * alignments.readLen / len > geneMaxCov[gid - baseGeneId ] )
			geneMaxCov[gid - baseGeneId] = ( transcripts[i].abundance * alignments.readLen ) / len ;
		txptGid[i] = gid ;
	}

	// Max FPKM among transcripts that do not overlap the gene's max-FPKM transcript.
	for ( i = 0 ; i < tcnt ; ++i )
	{
		int tag = txptGid[i] - baseGeneId ;
		if ( ( transcripts[i].last < transcripts[ geneMaxFPKMTag[ tag ] ].first
			|| transcripts[i].first > transcripts[ geneMaxFPKMTag[tag] ].last ) && transcripts[i].FPKM > nonOverlapMaxFPKM[tag] )
			nonOverlapMaxFPKM[tag] = transcripts[i].FPKM ;
	}
	BitTable bufferTable ; // scratch bit vector, reused below.
	bufferTable.Duplicate( transcripts[0].seVector ) ;

	if ( !aggressive )
	{
		// Rescue the transcripts covering unique constraints.
		// A transcript that is the ONLY one compatible with >= 2 strongly
		// unique mate-pair constraints gets its abundance/FPKM boosted 4x so
		// the fraction filter below is unlikely to remove it.
		int cnt = 0 ;
		int tag = 0 ;
		int *uniqCount = new int[tcnt] ;
		memset( uniqCount, 0, sizeof( int ) * tcnt ) ;
		for ( j = 0 ; j < tcCnt ; ++j )
		{
			cnt = 0 ;
			if ( tc[j].uniqSupport <= 5 )
				continue ;
			for ( i = 0 ; i < tcnt ; ++i )
			{
				if ( IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) &&
					IsConstraintInTranscript( transcripts[i], scc[ tc[j].j] ) )
				{
					tag = i ;
					++cnt ;
				}
				if ( cnt >= 2 )
					break ;
			}
			if ( cnt == 1 )
			{
				++uniqCount[tag] ;
			}
		}
		for ( i = 0 ; i < tcnt ; ++i )
		{
			if ( uniqCount[i] >= 2 )
			{
				transcripts[i].abundance *= 4 ;
				transcripts[i].FPKM *= 4 ;
			}
		}

		delete[] uniqCount ;
	}

	int sccCnt = scc.size() ;
	double filterFactor = 1.0 ;

	// Mark transcripts below the FPKMFraction threshold of their gene's max.
	for ( i = 0 ; i < tcnt ; ++i )
	{
		//printf( "%d: %lf %lf\n", txptGid[i], transcripts[i].abundance, geneMaxFPKM[ txptGid[i] - baseGeneId ] ) ;

		if ( transcripts[i].FPKM < filterFactor * FPKMFraction * geneMaxFPKM[ txptGid[i] - baseGeneId ] )
		{
			/*int cnt = 0 ;
			int coverCnt = 0 ;
			for ( j = 0 ; j < tcCnt ; ++j )
			{
				if ( transcripts[i].constraintsSupport[j] > 0 )
					++coverCnt ;
				double factor = tc[j].effectiveCount ;
				if ( transcripts[i].constraintsSupport[j] >= factor * tc[j].support - 1e-3
					&& tc[j].support >= 10
					&& tc[j].uniqSupport >= 0.95 * tc[j].support )
				{
					++cnt ;
				}
			}
			//cnt = 0 ;
			if ( cnt >= 2 )
			{
				;
			}
			else*/
				transcripts[i].abundance = -transcripts[i].abundance ;
		}
		//if ( transcripts[i].FPKM >= 0.8 * geneMaxFPKM[ txptGid[i] - baseGeneId ] && geneMaxCov[ txptGid[i] - baseGeneId ] >= txptMinReadDepth )
		//	continue ;
	}

	// NOTE(review): this compares the freshly new[]'d pointer against 0, which
	// is always true; presumably a value (not pointer) test was intended —
	// confirm before changing.
	if ( nonOverlapMaxFPKM != 0 )
	{
		// Go two iterations to rescue, the first iteration should be just for marking.
		// Rescue a marked transcript if it overlaps no kept transcript of the
		// same gene and clears the FPKMFraction bar against nonOverlapMaxFPKM.
		std::vector<int> rescueList ;
		for ( i = 0 ; i < tcnt ; ++i )
		{
			if ( transcripts[i].abundance >= 0 )
				continue ;

			for ( j = 0 ; j < tcnt ; ++j )
			{
				if ( transcripts[j].abundance < 0 || txptGid[i] != txptGid[j] )
					continue ;
				if ( transcripts[i].first <= transcripts[j].last && transcripts[i].last >= transcripts[j].first )
					/*bufferTable.Assign( transcripts[i].seVector ) ;
					bufferTable.And( transcripts[j].seVector ) ;

					if ( !bufferTable.IsAllZero() )*/
					break ;
			}
			if ( j >= tcnt && transcripts[i].FPKM >= FPKMFraction * nonOverlapMaxFPKM[ txptGid[i] - baseGeneId ] )
			{
				//transcripts[i].abundance = -transcripts[i].abundance ;
				rescueList.push_back( i ) ;
			}
		}

		int size = rescueList.size() ;
		for ( i = 0 ; i < size ; ++i )
			transcripts[ rescueList[i] ].abundance *= -1 ;
	}

	/*==================================================================
	Remove transcripts that has too few read coverage (-d)
	====================================================================*/
	for ( i = 0 ; i < tcnt ; ++i )
	{
		if ( transcripts[i].abundance >= 0 )
		{
			int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ;
			double cov = ( transcripts[i].abundance * alignments.readLen ) / len ;
			//printf( "%d: %d %d %lf %lf\n", i, len, transcripts[i].seVector.Count(), cov, geneMaxCov[ txptGid[i] - baseGeneId ] ) ;

			if ( ( tcnt > 1 || len <= 1000 || transcripts[i].seVector.Count() <= 3 ) && cov < txptMinReadDepth )
			{
				// Never remove the transcript holding the gene's max coverage.
				//if ( usedGeneId == baseGeneId + 1 && /*transcripts[i].seVector.Count() > 3
				//	&& len > 1000 &&*/ geneMaxCov[ txptGid[i] - baseGeneId ] == cov )
				if ( geneMaxCov[ txptGid[i] - baseGeneId ] == cov )
					continue ;

				// Test whether it has some very abundant constraints.
				/*int cnt = 0 ;
				for ( j = 0 ; j < tcCnt ; ++j )
				{
					if ( transcripts[i].constraintsSupport[j] >= tc[j].support / 2.0
						&& tc[j].support >= 10
						&& tc[j].uniqSupport >= 0.95 * tc[j].support
						&& tc[j].normAbund >= 1 )
					{
						++cnt ;
					}
				}

				if ( cnt >= 1 )
				{
					continue ;
				}*/

				// Test whether this transcript is fully covered. If so ,we can filter it.

				if ( geneMaxCov[ txptGid[i] - baseGeneId ] <= 5 )
				{
					// Low-coverage gene: only remove if the single-end
					// constraints fully tile the transcript's subexons.
					bufferTable.Reset() ;
					for ( j = 0 ; j < sccCnt ; ++j )
					{
						if ( !IsConstraintInTranscript( transcripts[i], scc[j] ) )
							continue ;
						bufferTable.Or( scc[j].vector ) ;
					}
					if ( bufferTable.IsEqual( transcripts[i].seVector ) )
						transcripts[i].abundance = -transcripts[i].abundance ;
				}
				else
					transcripts[i].abundance = -transcripts[i].abundance ;

				/*else
				{
					transcripts[i].seVector.Print() ;
					bufferTable.Print() ;
					OutputTranscript( stderr, subexons, transcripts[i] ) ;
				}*/
			}
		}
	}

	/*==================================================================
	Remove transcripts that is too short
	====================================================================*/
	for ( i = 0 ; i < tcnt ; ++i )
	{
		if ( transcripts[i].abundance <= 0 )
			continue ;

		int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ;
		if ( len < 200 )
		{
			transcripts[i].abundance = -transcripts[i].abundance ;
		}
	}

	// Rescue transcripts that showed up in many samples.
	/*for ( i = 0 ; i < tcnt ; ++i )
	{
		if ( transcripts[i].abundance > 0 )
			continue ;
		if ( txptSampleSupport[ transcripts[i].id ] >= 3 &&
			txptSampleSupport[transcripts[i].id ] >= (int)( sampleCnt / 2 ) )
			transcripts[i].abundance = -transcripts[i].abundance ;
	}*/

	// Rescue some transcripts covering subexon chains showed up in many samples, but missing after filtration.
	struct _constraint tmpC ; // scratch two-subexon constraint representing a chain.
	tmpC.vector.Init( seCnt ) ;

	std::vector< struct _pair32 > missingChain ; // chains (a,b) no kept transcript contains.
	std::vector<int> recoverCandidate ; // filtered transcripts that contain a missing chain.
	bool *used = new bool[tcnt] ;
	memset( used, false, sizeof( bool ) * tcnt ) ;

	// Obtain the list of transcripts that should be recovered.
	for ( i = 0 ; i < seCnt && sampleCnt > 1 ; ++i )
	{

		double maxFPKM = -1 ;
		for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ;
			it != subexonChainSupport[i].end() ; ++it )
		{
			// Only consider chains with enough cross-sample support.
			if ( sampleCnt >= 0 && ( it->second < 3 || it->second < (int)( 0.5 * sampleCnt ) ) && it->second <= sampleCnt / 2 )
				continue ;

			bool recover = true ;
			tmpC.vector.Reset() ;
			tmpC.vector.Set( i ) ;
			tmpC.vector.Set( it->first ) ;
			tmpC.first = i ;
			tmpC.last = it->first ;


			for ( j = 0 ; j < tcnt ; ++j )
			{
				if ( transcripts[j].abundance < 0 )
					continue ;

				// Chain already covered by a kept transcript: nothing to recover.
				if ( IsConstraintInTranscript( transcripts[j], tmpC ) )
				{
					recover = false ;
					break ;
				}

				// NOTE(review): this "if ( recover )" block sits INSIDE the
				// loop above and reuses its loop variable j, so it fires as
				// soon as the FIRST kept transcript lacks the chain (later
				// kept transcripts are never checked) and its inner loop then
				// terminates the outer one. It reads as if it were meant to
				// follow the loop — confirm against upstream before changing,
				// since fixing it alters which transcripts get rescued.
				if ( recover )
				{
					// Scan the filtered transcripts for ones carrying this chain.
					for ( j = 0 ; j < tcnt ; ++j )
					{
						if ( transcripts[j].abundance > 0 )
							continue ;
						//printf( "%d %lf\n", IsConstraintInTranscript( transcripts[j], tmpC ), transcripts[j].FPKM ) ;
						if ( IsConstraintInTranscript( transcripts[j], tmpC ) )
						{
							/*if ( maxTag == -1 )
								maxTag = j ;
							else
							{
								if ( txptSampleSupport[ transcripts[j].id ] > txptSampleSupport[ transcripts[maxTag ].id ] )
									maxTag = j ;
								else if ( txptSampleSupport[ transcripts[j].id ] == txptSampleSupport[ transcripts[maxTag ].id ])
								{
									if ( transcripts[j].FPKM > transcripts[maxTag].FPKM )
										maxTag = j ;
								}
							}*/

							struct _pair32 np ;
							np.a = i ; np.b = it->first ;
							missingChain.push_back( np ) ;

							if ( !used[j] )
							{
								recoverCandidate.push_back( j ) ;
								used[j] = true ;
							}
						}
					}

					/*if ( maxTag != -1 && txptSampleSupport[ transcripts[maxTag].id ] > 1 )
					{
						//printf( "recover %d %d\n", maxTag, txptSampleSupport[ transcripts[maxTag].id ] ) ;
						transcripts[maxTag].abundance *= -1 ;
					}*/
				}
			}

		}
	}

	int size = recoverCandidate.size() ;
	memset( used, false, sizeof( bool ) * tcnt ) ;
	// Recover the candidates in the order of reliability
	int *geneRecoverCnt = new int[ usedGeneId - baseGeneId ] ; // cap rescues per gene at `round`.
	memset( geneRecoverCnt, 0, sizeof( int ) * ( usedGeneId - baseGeneId ) ) ;
	int round = 1 ;
	// NOTE(review): both branches leave round == 1; if aggressive mode was
	// meant to allow more rescues per gene, this assignment is a no-op.
	if ( aggressive && size > 1 )
		round = 1 ;

	// Greedy selection: repeatedly pick the unused candidate with the highest
	// sample support (FPKM breaks ties) and un-mark it if it still explains at
	// least one missing chain and has coverage > 1.
	for ( i = 0 ; i < size ; ++i )
	{
		int maxTag = -1 ;
		int maxCover = -1 ;
		for ( j = 0 ; j < size ; ++j )
		{
			if ( !used[ recoverCandidate[j] ] )
			{
				/*int cover = 0 ;

				k = missingChain.size() ;
				int l ;
				for ( l = 0 ; l < k ; ++l )
				{
					if ( missingChain[l].a == -1 )
						continue ;

					tmpC.vector.Reset() ;
					tmpC.vector.Set( missingChain[l].a ) ;
					tmpC.vector.Set( missingChain[l].b ) ;
					tmpC.first = missingChain[l].a ;
					tmpC.last = missingChain[l].b ;

					if ( IsConstraintInTranscript( transcripts[ recoverCandidate[j] ], tmpC ) )
					{
						++cover ;
					}
				}*/

				if ( maxTag == -1 )
				{
					maxTag = recoverCandidate[j] ;
					//maxCover = cover ;
					continue ;
				}

				/*if ( cover > maxCover )
				{
					maxTag = recoverCandidate[j] ;
					maxCover = cover ;
				}
				else if ( cover == maxCover )
				{*/
				if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] >
					txptSampleSupport[
					transcripts[ maxTag ].id
					] )
					maxTag = recoverCandidate[j] ;
				else if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] ==
					txptSampleSupport[ transcripts[ maxTag ].id ] )
				{
					if ( transcripts[ recoverCandidate[j] ].FPKM > transcripts[ maxTag ].FPKM )
						maxTag = recoverCandidate[j] ;
				}

				/*else if ( transcripts[ recoverCandidate[j] ].FPKM > transcripts[ maxTag ].FPKM )
					maxTag = recoverCandidate[j] ;
				else if ( transcripts[ recoverCandidate[j] ].FPKM == transcripts[ maxTag ].FPKM )
				{
					if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] >
						txptSampleSupport[ transcripts[ maxTag ].id ] )
						maxTag = recoverCandidate[j] ;
				}*/
				//}
			}
		}

		// Stop when the best remaining candidate lacks sample support.
		if ( maxTag == -1 || txptSampleSupport[ transcripts[ maxTag ].id ] <= 2
			|| txptSampleSupport[ transcripts[maxTag].id ] < 0.5 * sampleCnt )
			break ;

		used[maxTag] = true ;
		if ( geneRecoverCnt[ txptGid[maxTag] - baseGeneId ] >= round )
			continue ;
		++geneRecoverCnt[ txptGid[maxTag] - baseGeneId ] ;

		// Retire every missing chain this candidate covers (a == -1 marks retired).
		k = missingChain.size() ;
		int cnt = 0 ;
		for ( j = 0 ; j < k ; ++j )
		{
			if ( missingChain[j].a == -1 )
				continue ;

			tmpC.vector.Reset() ;
			tmpC.vector.Set( missingChain[j].a ) ;
			tmpC.vector.Set( missingChain[j].b ) ;
			tmpC.first = missingChain[j].a ;
			tmpC.last = missingChain[j].b ;

			if ( IsConstraintInTranscript( transcripts[maxTag], tmpC ) )
			{
				missingChain[j].a = -1 ;
				++cnt ;
			}
		}

		int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[maxTag].abundance, transcripts[maxTag].FPKM ) ;
		double cov = ( transcripts[maxTag].abundance * alignments.readLen ) / len ;
		if ( cnt >= 1 && cov > 1.0 )
		{
			// Un-mark: abundance was negative, so *= -1 restores it.
			transcripts[maxTag].abundance *= -1 ;
		}
	}
	delete[] used ;
	delete[] geneRecoverCnt ;
	tmpC.vector.Release() ;


	tcnt = RemoveNegativeAbundTranscripts( transcripts ) ;



	delete []geneMaxCov ;
	bufferTable.Release() ;
	delete []geneMaxFPKM ;
	delete []geneMaxFPKMTag ;
	delete []nonOverlapMaxFPKM ;
	delete []txptGid ;

	/*==================================================================
	Remove transcripts that seems duplicated
	====================================================================*/
	// A transcript whose compatible constraints are almost entirely
	// multi-mapped (<3% unique) is likely an alignment artifact.
	for ( i = 0 ; i < tcnt ; ++i )
	{
		int support = 0 ;
		int uniqSupport = 0 ;

		for ( j = 0 ; j < tcCnt ; ++j )
		{
			if ( !IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) || !IsConstraintInTranscript( transcripts[i], scc[ tc[j].j ] ) )
				continue ;
			//support += scc[ tc[j].i ].support + scc[ tc[j].j ].support ;
			//uniqSupport += scc[ tc[j].i ].uniqSupport + scc[ tc[j].j ].uniqSupport ;
			support += tc[j].support ;
			uniqSupport += tc[j].uniqSupport ;

			//printf( "constraint uniqness: %d: %d %d\n", i, tc[j].uniqSupport, tc[j].support ) ;
		}
		//printf( "%d: %d %d\n", i, uniqSupport, support ) ;
		if ( (double)uniqSupport < 0.03 * support )
			transcripts[i].abundance = -1 ;
	}
	tcnt = RemoveNegativeAbundTranscripts( transcripts ) ;


	/*==================================================================
	Remove shadow transcripts, the abnormal 2-exon txpt whose intron is very close to the true one or one of the anchor exon is shorter than 25bp....
	====================================================================*/
	// Determine the region's dominant strand by majority vote.
	int minusCnt = 0, plusCnt = 0 ;
	int mainStrand ;
	for ( i = 0 ; i < seCnt ; ++i )
	{
		if ( subexons[i].rightStrand == 1 )
			++plusCnt ;
		else if ( subexons[i].rightStrand == -1 )
			++minusCnt ;
	}
	if ( plusCnt > minusCnt )
		mainStrand = 1 ;
	else
		mainStrand = -1 ;

	for ( i = 0 ; i < tcnt ; ++i )
	{
		// Only single-intron (two-exon) transcripts are candidates for shadows.
		std::vector<int> subexonIdx ;
		transcripts[i].seVector.GetOnesIndices( subexonIdx ) ;
		int size = subexonIdx.size() ;
		int intronCnt = 0 ;
		int anchorIdx = 0 ; // the subexon adjacent to the only intron.

		for ( j = 0 ; j < size - 1 ; ++j )
		{
			if ( subexons[ subexonIdx[j] ].end + 1 < subexons[ subexonIdx[j + 1] ].start )
			{
				++intronCnt ;
				anchorIdx = j ;
			}
		}
		if ( intronCnt != 1 )
			continue ;

		// Total exonic length on each side of the intron.
		int anchorExonLength[2] = {0, 0};
		int tag = 0 ;
		for ( j = 0 ; j < size ; ++j )
		{
			anchorExonLength[tag] += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ;
			if ( tag == 0 && subexons[ subexonIdx[j] ].end + 1 < subexons[ subexonIdx[j + 1] ].start )
				++tag ;
		}

		// flag counts evidence of a shadow; flag == 2 condemns the transcript.
		int flag = 0 ;
		if ( subexons[ subexonIdx[anchorIdx] ].rightStrand == mainStrand )
		{
			j = subexonIdx[ anchorIdx ] ;
			if ( subexons[j].end - subexons[j].start + 1 <= 20 ||
				( subexons[j+ 1].start == subexons[j].end + 1 && subexons[j + 1].end - subexons[j + 1].start + 1 <= 20
				&& subexons[j + 1].rightStrand == mainStrand ) )
				++flag ;
			j = subexonIdx[ anchorIdx + 1 ] ;
			if ( subexons[j].end - subexons[j].start + 1 <= 20 ||
				( subexons[j].start == subexons[j - 1].end + 1 && subexons[j - 1].end - subexons[j - 1].start + 1 <= 20
				&& subexons[j - 1].leftStrand == mainStrand ) )
				++flag ;
		}

		if ( anchorExonLength[0] <= 25 || anchorExonLength[1] <= 25 )
			flag = 2 ;

		// the alignment support the intron must be unique and has enough support.
		int support = 0 ;
		int uniqSupport = 0 ;
		for ( j = 0 ; j < tcCnt ; ++j )
		{
			if ( !IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) || !IsConstraintInTranscript( transcripts[i], scc[ tc[j].j ] ) )
				continue ;
			if ( ( scc[ tc[j].i ].vector.Test( subexonIdx[ anchorIdx ] ) && scc[ tc[j].i ].vector.Test( subexonIdx[ anchorIdx + 1 ] ) )
				|| ( scc[ tc[j].j ].vector.Test( subexonIdx[ anchorIdx ] ) && scc[ tc[j].j ].vector.Test( subexonIdx[ anchorIdx + 1 ] ) ) )
			{
				support += tc[j].support ;
				uniqSupport += tc[j].uniqSupport ;
			}

		}

		if ( (double)uniqSupport < 0.3 * support || support < txptMinReadDepth )
		{
			flag = 2 ;
		}

		if ( flag == 2 )
			transcripts[i].abundance = -1 ;

	}
	tcnt = RemoveNegativeAbundTranscripts( transcripts ) ;

	return transcripts.size() ;
}
2838
2839 void TranscriptDecider::ComputeTranscriptsScore( struct _subexon *subexons, int seCnt, std::map<int, int> *subexonChainSupport, std::vector<struct _transcript> &transcripts )
2840 {
2841 int i, j ;
2842 int tcnt = transcripts.size() ;
2843 struct _constraint tmpC ;
2844 tmpC.vector.Init( seCnt ) ;
2845
2846 for ( i = 0 ; i < tcnt ; ++i )
2847 transcripts[i].correlationScore = 0 ;
2848
2849 for ( i = 0 ; i < seCnt ; ++i )
2850 {
2851 for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ;
2852 it != subexonChainSupport[i].end() ; ++it )
2853 {
2854 if ( sampleCnt >= 0 && ( it->second < 3 || it->second < (int)( 0.1 * sampleCnt ) ) && it->second <= sampleCnt / 2 )
2855 continue ;
2856
2857 tmpC.vector.Reset() ;
2858 tmpC.vector.Set( i ) ;
2859 tmpC.vector.Set( it->first ) ;
2860 tmpC.first = i ;
2861 tmpC.last = it->first ;
2862
2863 for ( j = 0 ; j < tcnt ; ++j )
2864 {
2865 if ( IsConstraintInTranscript( transcripts[j], tmpC ) )
2866 ++transcripts[j].correlationScore ;
2867 }
2868 }
2869 }
2870
2871 tmpC.vector.Release() ;
2872 }
2873
2874 int TranscriptDecider::Solve( struct _subexon *subexons, int seCnt, std::vector<Constraints> &constraints, SubexonCorrelation &subexonCorrelation )
2875 {
2876 int i, j, k ;
2877 int cnt = 0 ;
2878 int *f = new int[seCnt] ; // this is a general buffer for a type of usage.
2879 bool useDP = false ;
2880
2881 compatibleTestVectorT.Init( seCnt ) ; // this is the bittable used in compatible test function.
2882 compatibleTestVectorC.Init( seCnt ) ;
2883
2884 for ( i = 0 ; i < seCnt ; ++i )
2885 {
2886 subexons[i].canBeStart = subexons[i].canBeEnd = false ;
2887
2888 if ( subexons[i].prevCnt == 0 )
2889 subexons[i].canBeStart = true ;
2890 else if ( subexons[i].leftClassifier < canBeSoftBoundaryThreshold && subexons[i].leftClassifier != -1
2891 && subexons[i].leftStrand != 0 ) // The case of overhang.
2892 {
2893 // We then look into whether there is a left-side end already showed up before this subexon in this region of subexons.
2894 bool flag = true ;
2895 for ( j = i - 1 ; j >= 0 ; --j )
2896 {
2897 if ( subexons[j].end + 1 != subexons[j + 1].start )
2898 break ;
2899 if ( subexons[i].canBeStart == true )
2900 {
2901 flag = false ;
2902 break ;
2903 }
2904 }
2905 subexons[i].canBeStart = flag ;
2906 }
2907
2908 if ( subexons[i].nextCnt == 0 )
2909 subexons[i].canBeEnd = true ;
2910 else if ( subexons[i].rightClassifier < canBeSoftBoundaryThreshold && subexons[i].rightClassifier != -1
2911 && subexons[i].rightStrand != 0 )
2912 {
2913 subexons[i].canBeEnd = true ;
2914 }
2915 // Remove other soft end already showed up in this region of subexons.
2916 if ( subexons[i].canBeEnd == true )
2917 {
2918 for ( j = i - 1 ; j >= 0 ; --j )
2919 {
2920 if ( subexons[j].end + 1 != subexons[j + 1].start )
2921 break ;
2922 if ( subexons[j].canBeEnd == true )
2923 {
2924 subexons[j].canBeEnd = false ;
2925 break ;
2926 }
2927 }
2928 }
2929 //printf( "%d: %d %lf\n", subexons[i].canBeStart, subexons[i].prevCnt, subexons[i].leftClassifier ) ;
2930 }
2931
2932 // Go through the cases of mixture region to set canBeStart/End.
2933 // e.g: +[...]+_____+[....]-...]+____+[..)_____-[...]-
2934 // ^ then we need to force a start point here.
2935 // Do we need to associate a strand information with canBeStart, canBeEnd?
2936 for ( i = 0 ; i < seCnt ; )
2937 {
2938 // [i, j) is a region.
2939 for ( j = i + 1 ; j < seCnt ; ++j )
2940 if ( subexons[j].start > subexons[j - 1].end + 1 )
2941 break ;
2942 if ( subexons[i].canBeStart == false ) // then subexons[i] must has a hard left boundary.
2943 {
2944 int leftStrandCnt[2] = {0, 0} ;
2945 for ( k = i ; k < j ; ++k )
2946 {
2947 if ( !SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) )
2948 break ;
2949 if ( subexons[k].leftStrand != 0 )
2950 ++leftStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] ;
2951 }
2952 if ( k < j && leftStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] == 0 )
2953 subexons[i].canBeStart = true ;
2954 }
2955
2956 if ( subexons[j - 1].canBeEnd == false )
2957 {
2958 int rightStrandCnt[2] = {0, 0} ;
2959 for ( k = j - 1 ; k >= i ; --k )
2960 {
2961 if ( !SubexonGraph::IsSameStrand( subexons[k].leftStrand, subexons[j - 1].rightStrand ) )
2962 break ;
2963 if ( subexons[k].rightStrand != 0 )
2964 ++rightStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] ;
2965 }
2966 if ( k >= i && rightStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] == 0 )
2967 subexons[j - 1].canBeEnd = true ;
2968 }
2969
2970 //if ( subexons[i].start == 6870264)
2971 // printf( "hi %d %d\n",i , subexons[i].canBeStart ) ;
2972 i = j ;
2973 }
2974 /*for ( i = 0 ; i < seCnt ; ++i )
2975 {
2976 printf( "%d %d: %d %d\n", subexons[i].start, subexons[i].end, subexons[i].canBeStart, subexons[i].canBeEnd ) ;
2977 }*/
2978
2979 // Find the gene ids.
2980 baseGeneId = subexons[0].lcCnt ;
2981 usedGeneId = subexons[0].rcCnt ;
2982 defaultGeneId[0] = defaultGeneId[1] = -1 ;
2983 for ( i = 0 ; i < seCnt ; ++i )
2984 {
2985 if ( subexons[i].geneId < 0 )
2986 continue ;
2987
2988 //if ( baseGeneId == -1 || subexons[i].geneId < baseGeneId )
2989 // baseGeneId = subexons[i].geneId ;
2990 //if ( subexons[i].geneId > usedGeneId )
2991 // usedGeneId = subexons[i].geneId ;
2992
2993 if ( ( subexons[i].rightStrand == -1 || subexons[i].leftStrand == -1 ) &&
2994 ( defaultGeneId[0] == -1 || subexons[i].geneId < defaultGeneId[0] ) )
2995 defaultGeneId[0] = subexons[i].geneId ;
2996 if ( ( subexons[i].rightStrand == 1 || subexons[i].leftStrand == 1 ) &&
2997 ( defaultGeneId[1] == -1 || subexons[i].geneId < defaultGeneId[1] ) )
2998 defaultGeneId[1] = subexons[i].geneId ;
2999 }
3000 if ( defaultGeneId[0] == -1 )
3001 defaultGeneId[0] = baseGeneId ;
3002 if ( defaultGeneId[1] == -1 )
3003 defaultGeneId[1] = usedGeneId - 1 ;
3004
3005 // Go through the constraints to find the chain of subexons that should be kept.
3006 std::map<int, int> *subexonChainSupport = new std::map<int, int>[ seCnt ] ;
3007 for ( i = 0 ; i < sampleCnt ; ++i )
3008 {
3009 std::vector<int> subexonIdx ;
3010 std::vector<struct _pair32> chain ;
3011
3012 int tcCnt = constraints[i].constraints.size() ;
3013 int size ;
3014 for ( j = 0 ; j < tcCnt ; ++j )
3015 {
3016 struct _constraint c = constraints[i].constraints[j] ;
3017 if ( c.uniqSupport < 0.95 * c.support || c.support < 3 )
3018 continue ;
3019
3020 subexonIdx.clear() ;
3021 c.vector.GetOnesIndices( subexonIdx ) ;
3022 size = subexonIdx.size() ;
3023
3024 for ( k = 0 ; k < size - 1 ; ++k )
3025 {
3026 struct _pair32 p ;
3027
3028 p.a = subexonIdx[k] ;
3029 p.b = subexonIdx[k + 1] ;
3030 //if ( subexons[p.a].end + 1 == 113235898 && subexons[ p.b ].start + 1 == 113236121 )
3031 // printf( "bad bad %d %d %d\n", i, c.uniqSupport, c.support ) ;
3032
3033 if ( subexons[ p.a ].end + 1 < subexons[ p.b ].start )
3034 chain.push_back( p ) ;
3035 }
3036 }
3037 // Remove redundancy.
3038 sort( chain.begin(), chain.end(), CompSortPairs ) ;
3039 size = chain.size() ;
3040 k = 0 ;
3041 for ( j = 1 ; j < size ; ++j )
3042 {
3043 if ( chain[j].a == chain[k].a && chain[j].b == chain[k].b )
3044 continue ;
3045 else
3046 {
3047 ++k ;
3048 chain[k] = chain[j] ;
3049 }
3050 }
3051 chain.resize( k + 1 ) ;
3052
3053 // Add those to sample count
3054 size = k + 1 ;
3055 for ( j = 0 ; j < size ; ++j )
3056 {
3057 if ( subexonChainSupport[ chain[j].a ].count( chain[j].b ) )
3058 {
3059 ++subexonChainSupport[ chain[j].a ][ chain[j].b ] ;
3060 }
3061 else
3062 subexonChainSupport[ chain[j].a ][ chain[j].b ] = 1 ;
3063 }
3064 }
3065
3066 /*for ( i = 0 ; i < seCnt ; ++i )
3067 {
3068 printf( "%d:", i ) ;
3069 for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ; it != subexonChainSupport[i].end() ; ++it )
3070 printf( " (%d %d) ", it->first, it->second ) ;
3071 printf( "\n" ) ;
3072 }*/
3073
3074 //printf( "%d %d %d\n", defaultGeneId[0], baseGeneId, usedGeneId ) ;
3075 cnt = 0 ;
3076 memset( f, -1, sizeof( int ) * seCnt ) ;
3077 for ( i = 0 ; i < seCnt ; ++i )
3078 {
3079 if ( subexons[i].canBeStart )
3080 {
3081 cnt += SubTranscriptCount( i, subexons, f ) ;
3082 }
3083 }
3084 if ( cnt <= USE_DP )
3085 {
3086 for ( i = 0 ; i < seCnt ; ++i )
3087 if ( f[i] > USE_DP )
3088 {
3089 useDP = true ;
3090 break ;
3091 }
3092 }
3093 else
3094 useDP = true ;
3095 if ( !useDP )
3096 {
3097 for ( i = 0 ; i < sampleCnt ; ++i )
3098 {
3099 double msize = constraints[i].matePairs.size() ;
3100 double csize = constraints[i].constraints.size() ;
3101 if ( cnt > ( csize / msize ) * ( csize / msize ) * seCnt
3102 && cnt > USE_DP / ( msize * msize ) && cnt > 50 )
3103 {
3104 useDP = true ;
3105 break ;
3106 }
3107 }
3108 }
3109
3110 int atCnt = cnt ;
3111 printf( "%d: atCnt=%d seCnt=%d %d %d %d\n", subexons[0].start + 1, atCnt, seCnt, useDP, (int)constraints[0].constraints.size(), (int)constraints[0].matePairs.size() ) ;
3112 fflush( stdout ) ;
3113 std::vector<struct _transcript> alltranscripts ;
3114
3115 if ( !useDP )
3116 {
3117 int origSize = atCnt ;
3118 alltranscripts.resize( atCnt ) ;
3119 for ( i = 0 ; i < atCnt ; ++i )
3120 {
3121 alltranscripts[i].seVector.Init( seCnt ) ;
3122 alltranscripts[i].correlationScore = 1 ;
3123 }
3124
3125 atCnt = 0 ;
3126 for ( i = 0 ; i < seCnt ; ++i )
3127 {
3128 if ( subexons[i].canBeStart )
3129 EnumerateTranscript( i, 0, f, 0, subexons, subexonCorrelation, 1, alltranscripts, atCnt ) ;
3130 }
3131
3132 for ( i = atCnt ; i < origSize ; ++i )
3133 alltranscripts[i].seVector.Release() ;
3134
3135 alltranscripts.resize( atCnt ) ;
3136 //printf( "transcript cnt: %d\n", atCnt ) ;
3137 //printf( "%d %d\n", alltranscripts[0].seVector.Test( 1 ), constraints[0].matePairs.size() ) ;
3138 }
3139 else // Use dynamic programming to pick a set of candidate transcript.
3140 {
3141 std::vector<struct _transcript> sampleTranscripts ;
3142
3143 // pre allocate the memory.
3144 struct _dpAttribute attr ;
3145 attr.f1 = new struct _dp[seCnt] ;
3146 if ( seCnt <= 10000 )
3147 {
3148 attr.f2 = new struct _dp*[seCnt] ;
3149 for ( i = 0 ; i < seCnt ; ++i )
3150 attr.f2[i] = new struct _dp[seCnt] ;
3151 }
3152 else
3153 attr.f2 = NULL ;
3154
3155 hashMax = HASH_MAX ;
3156 if (seCnt > 500)
3157 hashMax = 1000003 ;
3158 else if (seCnt > 1000)
3159 hashMax = 10000019 ;
3160 else if (seCnt > 1500)
3161 hashMax = 20000003 ;
3162
3163 attr.hash = dpHash ;
3164 if ( hashMax != HASH_MAX )
3165 attr.hash = new struct _dp[hashMax] ;
3166
3167 for ( i = 0 ; i < seCnt ; ++i )
3168 {
3169 attr.f1[i].seVector.Nullify() ;
3170 attr.f1[i].seVector.Init( seCnt ) ;
3171 for ( j = i ; j < seCnt ; ++j )
3172 {
3173 attr.f2[i][j].seVector.Nullify() ;
3174 attr.f2[i][j].seVector.Init( seCnt ) ;
3175 }
3176 }
3177 for ( i = 0 ; i < hashMax ; ++i )
3178 {
3179 attr.hash[i].seVector.Nullify() ;
3180 attr.hash[i].seVector.Init( seCnt ) ;
3181 }
3182
3183 // select candidate transcripts from each sample.
3184 struct _pair32 *sampleComplexity = new struct _pair32[ sampleCnt ] ;
3185 for ( i = 0 ; i < sampleCnt ; ++i )
3186 {
3187 sampleComplexity[i].a = i ;
3188 sampleComplexity[i].b = constraints[i].constraints.size() ;
3189 }
3190 qsort( sampleComplexity, sampleCnt, sizeof( sampleComplexity[0] ), CompPairsByB ) ;
3191 int downsampleCnt = -1 ;
3192
3193 for ( i = sampleCnt - 1 ; i >= 0 ; --i )
3194 {
3195 sampleTranscripts.clear() ;
3196 int iterBound = constraints[ sampleComplexity[i].a ].constraints.size() ;
3197 if ( i < sampleCnt - 1 )
3198 iterBound = 100 ;
3199
3200 if ( i < sampleCnt - 10 && alltranscripts.size() > 1000 )
3201 iterBound = 10 ;
3202 //printf( "%d %d: %d %d %d %d\n", subexons[0].start + 1, sampleComplexity[i].a, constraints[ sampleComplexity[i].a ].constraints.size(), constraints[ sampleComplexity[i].a ].matePairs.size(),
3203 // alltranscripts.size(), iterBound ) ; fflush( stdout ) ;
3204 if ( maxDpConstraintSize > 0 )
3205 {
3206 Constraints truncatedConstraints ;
3207 truncatedConstraints.TruncateConstraintsCoverFrom( constraints[ sampleComplexity[i].a ], seCnt, maxDpConstraintSize ) ;
3208 PickTranscriptsByDP( subexons, seCnt, iterBound, truncatedConstraints,
3209 subexonCorrelation, attr, sampleTranscripts ) ;
3210 }
3211 else if ( ( constraints[ sampleComplexity[i].a ].constraints.size() > 1000
3212 && constraints[ sampleComplexity[i].a ].constraints.size() * 10 < constraints[ sampleComplexity[i].a ].matePairs.size() )
3213 || ( downsampleCnt > 0 && (int)constraints[ sampleComplexity[i].a ].constraints.size() >= downsampleCnt )
3214 || seCnt >= 1500 )
3215 {
3216 Constraints downsampledConstraints ;
3217 int stride = (int)constraints[ sampleComplexity[i].a ].matePairs.size() / (int)constraints[ sampleComplexity[i].a ].constraints.size() ;
3218 if ( downsampleCnt > 0 )
3219 stride = (int)constraints[ sampleComplexity[i].a ].constraints.size() / downsampleCnt ;
3220 if ( stride < 1 )
3221 stride = 1 ;
3222 downsampledConstraints.DownsampleConstraintsFrom( constraints[ sampleComplexity[i].a ], stride ) ;
3223 if ( downsampleCnt <= 0 )
3224 downsampleCnt = downsampledConstraints.constraints.size() ;
3225 if ( iterBound <= 10 )
3226 continue ;
3227 PickTranscriptsByDP( subexons, seCnt, iterBound, downsampledConstraints, subexonCorrelation, attr, sampleTranscripts ) ;
3228 }
3229 else
3230 {
3231 PickTranscriptsByDP( subexons, seCnt, iterBound, constraints[ sampleComplexity[i].a ], subexonCorrelation, attr, sampleTranscripts ) ;
3232 }
3233 int size = sampleTranscripts.size() ;
3234 for ( j = 0 ; j < size ; ++j )
3235 alltranscripts.push_back( sampleTranscripts[j] ) ;
3236
3237 // we can further pick a smaller subsets of transcripts here if the number is still to big.
3238 CoalesceSameTranscripts( alltranscripts ) ;
3239
3240 AugmentTranscripts( subexons, alltranscripts, 1000, false ) ;
3241 }
3242
3243 // release the memory.
3244 delete[] sampleComplexity ;
3245 for ( i = 0 ; i < seCnt ; ++i )
3246 {
3247 attr.f1[i].seVector.Release() ;
3248 for ( j = i ; j < seCnt && attr.f2 ; ++j )
3249 attr.f2[i][j].seVector.Release() ;
3250 }
3251 for ( i = 0 ; i < hashMax ; ++i )
3252 attr.hash[i].seVector.Release() ;
3253
3254 delete[] attr.f1 ;
3255 for ( i = 0 ; i < seCnt && attr.f2 ; ++i )
3256 delete[] attr.f2[i] ;
3257 delete[] attr.f2 ;
3258 if (hashMax != HASH_MAX)
3259 delete[] attr.hash ;
3260
3261 }
3262
3263 transcriptId = new int[usedGeneId - baseGeneId] ;
3264 std::vector<struct _transcript> *predTranscripts = new std::vector<struct _transcript>[sampleCnt] ;
3265
3266 atCnt = alltranscripts.size() ;
3267 for ( i = 0 ; i < atCnt ; ++i )
3268 alltranscripts[i].FPKM = 0 ;
3269
3270 for ( i = 0 ; i < sampleCnt ; ++i )
3271 {
3272 int size = alltranscripts.size() ;
3273 for ( j = 0 ; j < size ; ++j )
3274 alltranscripts[j].abundance = -1 ;
3275 //printf( "pick: %d: %d %d\n", i, constraints[i].matePairs.size(), alltranscripts.size() ) ;
3276 PickTranscripts( subexons, alltranscripts, constraints[i], subexonCorrelation, predTranscripts[i] ) ;
3277
3278 /*double tmp = FPKMFraction ;
3279 FPKMFraction = 0 ;
3280 size = predTranscripts.size() ;
3281 for ( j = 0 ; j < size ; ++j )
3282 {
3283 ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[j] ) ;
3284 }
3285 RefineTranscripts( subexons, seCnt, predTranscripts, constraints[i] ) ;
3286 FPKMFraction = tmp ;*/
3287
3288 }
3289
3290 atCnt = alltranscripts.size() ;
3291 int *txptSampleSupport = new int[atCnt] ;
3292 memset( txptSampleSupport, 0, sizeof( int ) * atCnt ) ;
3293 for ( i = 0 ; i < sampleCnt ; ++i )
3294 {
3295 int size = predTranscripts[i].size() ;
3296 for ( j = 0 ; j < size ; ++j )
3297 {
3298 ++txptSampleSupport[ predTranscripts[i][j].id ] ;
3299 ++alltranscripts[ predTranscripts[i][j].id ].FPKM ;
3300 }
3301 }
3302
3303 for ( i = 0 ; i < sampleCnt ; ++i )
3304 {
3305 int size = alltranscripts.size() ;
3306 for ( j = 0 ; j < size ; ++j )
3307 alltranscripts[j].abundance = -1 ;
3308 //printf( "pick: %d: %d %d\n", i, constraints[i].matePairs.size(), alltranscripts.size() ) ;
3309
3310 size = predTranscripts[i].size() ;
3311 for ( j = 0 ; j < size ; ++j )
3312 {
3313 predTranscripts[i][j].seVector.Release() ;
3314 }
3315 predTranscripts[i].clear() ;
3316 PickTranscripts( subexons, alltranscripts, constraints[i], subexonCorrelation, predTranscripts[i] ) ;
3317 }
3318
3319 std::vector<int> *rawPredTranscriptIds = new std::vector<int>[sampleCnt] ;
3320 std::vector<double> *rawPredTranscriptAbundance = new std::vector<double>[sampleCnt] ;
3321 for ( i = 0 ; i < sampleCnt ; ++i )
3322 {
3323 int size = predTranscripts[i].size() ;
3324
3325 for ( j = 0 ; j < size ; ++j )
3326 {
3327 rawPredTranscriptIds[i].push_back( predTranscripts[i][j].id ) ;
3328 rawPredTranscriptAbundance[i].push_back( predTranscripts[i][j].abundance ) ;
3329 }
3330 }
3331
3332 // Do the filtration.
3333 for ( i = 0 ; i < sampleCnt ; ++i )
3334 {
3335 int size = predTranscripts[i].size() ;
3336 for ( j = 0 ; j < size ; ++j )
3337 {
3338 ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ;
3339 }
3340 size = RefineTranscripts( subexons, seCnt, false, subexonChainSupport, txptSampleSupport, predTranscripts[i], constraints[i] ) ;
3341
3342 // Recompute the abundance.
3343 AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ;
3344 for ( j = 0 ; j < size ; ++j )
3345 ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ;
3346 size = RefineTranscripts( subexons, seCnt, true, subexonChainSupport, txptSampleSupport, predTranscripts[i], constraints[i] ) ;
3347
3348 //ComputeTranscriptsScore( subexons, seCnt, subexonChainSupport, predTranscripts[i] ) ;
3349 }
3350
3351 // Rescue some filtered transcripts
3352 memset( txptSampleSupport, 0, sizeof( int ) * atCnt ) ;
3353 for ( i = 0 ; i < sampleCnt ; ++i )
3354 {
3355 int size = predTranscripts[i].size() ;
3356 for ( j = 0 ; j < size ; ++j )
3357 {
3358 ++txptSampleSupport[ predTranscripts[i][j].id ] ;
3359 }
3360 }
3361
3362 bool *predicted = new bool[atCnt] ;
3363 for ( i = 0 ; i < sampleCnt ; ++i )
3364 {
3365 memset( predicted, false, sizeof( bool ) * atCnt ) ;
3366 if ( predTranscripts[i].size() != rawPredTranscriptIds[i].size() )
3367 {
3368 int psize = predTranscripts[i].size() ;
3369 int rsize = rawPredTranscriptIds[i].size() ;
3370 int tcCnt = constraints[i].matePairs.size() ;
3371
3372 for ( j = 0 ; j < psize ; ++j )
3373 predicted[ predTranscripts[i][j].id ] = true ;
3374
3375 for ( j = 0 ; j < rsize ; ++j )
3376 {
3377 int id = rawPredTranscriptIds[i][j] ;
3378 if ( predicted[ id ] == false &&
3379 ( txptSampleSupport[ id ] >= 3 && txptSampleSupport[id] >= 0.25 * sampleCnt ) )
3380 {
3381 struct _transcript nt = alltranscripts[id] ;
3382 nt.seVector.Nullify() ;
3383 nt.seVector.Duplicate( alltranscripts[id].seVector ) ;
3384 nt.constraintsSupport = NULL ;
3385 nt.correlationScore = -1 ;
3386 nt.abundance = rawPredTranscriptAbundance[i][j] ;
3387 nt.id = id ;
3388 predTranscripts[i].push_back( nt ) ;
3389 }
3390 }
3391 if ( psize != predTranscripts[i].size() )
3392 AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ;
3393 }
3394
3395 int size = predTranscripts[i].size() ;
3396
3397 if ( 0 ) //size == 1 )
3398 {
3399 //AugmentTranscripts( subexons, predTranscripts[i], false ) ;
3400
3401 int l = predTranscripts[i].size() ;
3402 int tcCnt = constraints[i].matePairs.size() ;
3403 for ( j = 0 ; j < l ; ++j )
3404 {
3405 predTranscripts[i][j].abundance = 1.0 / alignments.readLen ;
3406 }
3407 AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ;
3408
3409 std::vector<int> subexonIdx ;
3410 for ( j = 0 ; j < l ; ++j )
3411 {
3412 subexonIdx.clear() ;
3413 predTranscripts[i][j].seVector.GetOnesIndices( subexonIdx ) ;
3414 int subexonIdxCnt = subexonIdx.size() ;
3415 int len = 0 ;
3416 for ( k = 0 ; k < subexonIdxCnt ; ++k )
3417 len += subexons[ subexonIdx[k] ].end - subexons[ subexonIdx[k] ].start + 1 ;
3418
3419 if ( predTranscripts[i][j].abundance * alignments.readLen / len < 2.0 )
3420 predTranscripts[i][j].abundance = -1 ;
3421 else
3422 ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ;
3423
3424 }
3425 RemoveNegativeAbundTranscripts( predTranscripts[i] ) ;
3426 }
3427
3428 // Output
3429 size = predTranscripts[i].size() ;
3430 InitTranscriptId() ;
3431 for ( j = 0 ; j < size ; ++j )
3432 {
3433 OutputTranscript( i, subexons, predTranscripts[i][j] ) ;
3434 }
3435 for ( j = 0 ; j < size ; ++j )
3436 {
3437 predTranscripts[i][j].seVector.Release() ;
3438 }
3439 }
3440
3441 delete []predicted ;
3442 delete []transcriptId ;
3443 delete []predTranscripts ;
3444 delete []rawPredTranscriptIds ;
3445 delete []rawPredTranscriptAbundance ;
3446 delete []txptSampleSupport ;
3447
3448 atCnt = alltranscripts.size() ;
3449 for ( i = 0 ; i < atCnt ; ++i )
3450 alltranscripts[i].seVector.Release() ;
3451 compatibleTestVectorT.Release() ;
3452 compatibleTestVectorC.Release() ;
3453 delete[] f ;
3454 delete[] subexonChainSupport ;
3455 return 0 ;
3456 }
3457
3458 void *TranscriptDeciderSolve_Wrapper( void *a )
3459 {
3460 int i ;
3461
3462 struct _transcriptDeciderThreadArg &arg = *( (struct _transcriptDeciderThreadArg *)a ) ;
3463 TranscriptDecider transcriptDecider( arg.FPKMFraction, arg.classifierThreshold, arg.txptMinReadDepth, arg.sampleCnt, *( arg.alignments ) ) ;
3464 transcriptDecider.SetNumThreads( arg.numThreads + 1 ) ;
3465 transcriptDecider.SetMultiThreadOutputHandler( arg.outputHandler ) ;
3466 transcriptDecider.SetMaxDpConstraintSize( arg.maxDpConstraintSize ) ;
3467 transcriptDecider.Solve( arg.subexons, arg.seCnt, arg.constraints, arg.subexonCorrelation ) ;
3468
3469 int start = arg.subexons[0].start ;
3470 int end = arg.subexons[ arg.seCnt - 1 ].end ;
3471 int chrId = arg.subexons[0].chrId ;
3472 // Release memory
3473 for ( i = 0 ; i < arg.seCnt ; ++i )
3474 {
3475 delete[] arg.subexons[i].prev ;
3476 delete[] arg.subexons[i].next ;
3477 }
3478 delete[] arg.subexons ;
3479
3480 // Put the work id back to the free threads queue.
3481 pthread_mutex_lock( arg.ftLock ) ;
3482 arg.freeThreads[ *( arg.ftCnt ) ] = arg.tid ;
3483 ++*( arg.ftCnt ) ;
3484 if ( *( arg.ftCnt ) == 1 )
3485 pthread_cond_signal( arg.fullWorkCond ) ;
3486 pthread_mutex_unlock( arg.ftLock) ;
3487 printf( "Thread %d: %s %d %d finished.\n", arg.tid, arg.alignments->GetChromName(chrId), start + 1, end + 1 ) ;
3488 fflush( stdout ) ;
3489
3490
3491 pthread_exit( NULL ) ;
3492 }