Comparison: PsiCLASS-1.0.2/TranscriptDecider.cpp @ 0:903fc43d6227 (draft, default, tip)
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children | |
| 1 #include "TranscriptDecider.hpp" | |
| 2 | |
| 3 void TranscriptDecider::OutputTranscript( int sampleId, struct _subexon *subexons, struct _transcript &transcript ) | |
| 4 { | |
| 5 int i, j ; | |
| 6 // Collect the subexon indices used by this transcript. | |
| 7 std::vector<int> subexonInd ; | |
| 8 transcript.seVector.GetOnesIndices( subexonInd ) ; | |
| 9 | |
| 10 // Determine the strand | |
| 11 char strand[2] = "." ; | |
| 12 int size = subexonInd.size() ; | |
| 13 if ( size > 1 ) | |
| 14 { | |
| 15 // locate an intron that shows up in this transcript. | |
| 16 for ( i = 0 ; i < size - 1 ; ++i ) | |
| 17 { | |
| 18 /*int nextCnt = subexons[ subexonInd[i] ].nextCnt ; | |
| 19 if ( nextCnt == 0 ) | |
| 20 continue ; | |
| 21 | |
| 22 for ( j = 0 ; j < nextCnt ; ++j ) | |
| 23 { | |
| 24 int a = subexons[ subexonInd[i] ].next[j] ; | |
| 25 if ( subexonInd[i + 1] == a | |
| 26 && subexons[ subexonInd[i] ].end + 1 < subexons[a].start ) // avoid the case like ..(...[... | |
| 27 { | |
| 28 break ; | |
| 29 } | |
| 30 } | |
| 31 if ( j < nextCnt )*/ | |
| 32 | |
| 33 if ( subexons[ subexonInd[i] ].end + 1 < subexons[ subexonInd[i + 1] ].start ) | |
| 34 { | |
| 35 if ( subexons[ subexonInd[i] ].rightStrand == 1 ) | |
| 36 strand[0] = '+' ; | |
| 37 else if ( subexons[ subexonInd[i] ].rightStrand == -1 ) | |
| 38 strand[0] = '-' ; | |
| 39 break ; | |
| 40 } | |
| 41 } | |
| 42 } | |
| 43 | |
| 44 // TODO: transcript_id | |
| 45 char *chrom = alignments.GetChromName( subexons[0].chrId ) ; | |
| 46 char prefix[10] = "" ; | |
| 47 struct _subexon *catSubexons = new struct _subexon[ size + 1 ] ; | |
| 48 // Concatenate adjacent subexons | |
| 49 catSubexons[0] = subexons[ subexonInd[0] ] ; | |
| 50 j = 1 ; | |
| 51 for ( i = 1 ; i < size ; ++i ) | |
| 52 { | |
| 53 if ( subexons[ subexonInd[i] ].start == catSubexons[j - 1].end + 1 ) | |
| 54 { | |
| 55 catSubexons[j - 1].end = subexons[ subexonInd[i] ].end ; | |
| 56 } | |
| 57 else | |
| 58 { | |
| 59 catSubexons[j] = subexons[ subexonInd[i] ] ; | |
| 60 ++j ; | |
| 61 } | |
| 62 } | |
| 63 size = j ; | |
| 64 | |
| 65 int gid = GetTranscriptGeneId( subexonInd, subexons ) ; | |
| 66 if ( 0 ) //numThreads <= 1 ) | |
| 67 { | |
| 68 fprintf( outputFPs[sampleId], "%s\tCLASSES\ttranscript\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; Abundance \"%.6lf\";\n", | |
| 69 chrom, catSubexons[0].start + 1, catSubexons[size - 1].end + 1, strand, | |
| 70 prefix, chrom, gid, | |
| 71 prefix, chrom, gid, transcriptId[ gid - baseGeneId ], transcript.FPKM ) ; | |
| 72 for ( i = 0 ; i < size ; ++i ) | |
| 73 { | |
| 74 fprintf( outputFPs[ sampleId ], "%s\tCLASSES\texon\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; " | |
| 75 "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; Abundance \"%.6lf\"\n", | |
| 76 chrom, catSubexons[i].start + 1, catSubexons[i].end + 1, strand, | |
| 77 prefix, chrom, gid, | |
| 78 prefix, chrom, gid, transcriptId[ gid - baseGeneId ], | |
| 79 i + 1, transcript.FPKM ) ; | |
| 80 } | |
| 81 } | |
| 82 else | |
| 83 { | |
| 84 struct _outputTranscript t ; | |
| 85 int len = 0 ; | |
| 86 t.chrId = subexons[0].chrId ; | |
| 87 t.geneId = gid ; | |
| 88 t.transcriptId = transcriptId[ gid - baseGeneId ] ; | |
| 89 t.FPKM = transcript.FPKM ; | |
| 90 t.sampleId = sampleId ; | |
| 91 t.exons = new struct _pair32[size] ; | |
| 92 for ( i = 0 ; i < size ; ++i ) | |
| 93 { | |
| 94 t.exons[i].a = catSubexons[i].start + 1 ; | |
| 95 t.exons[i].b = catSubexons[i].end + 1 ; | |
| 96 len += t.exons[i].b - t.exons[i].a + 1 ; | |
| 97 } | |
| 98 t.cov = transcript.abundance * alignments.readLen / len ; | |
| 99 t.ecnt = size ; | |
| 100 t.strand = strand[0] ; | |
| 101 //printf( "%lf\n", transcript.correlationScore ) ; | |
| 102 | |
| 103 if ( numThreads > 1 ) | |
| 104 outputHandler->Add( t ) ; | |
| 105 else | |
| 106 outputHandler->Add_SingleThread( t ) ; | |
| 107 } | |
| 108 ++transcriptId[ gid - baseGeneId ] ; | |
| 109 | |
| 110 delete[] catSubexons ; | |
| 111 } | |
| 112 | |
| 113 int TranscriptDecider::GetFather( int f, int *father ) | |
| 114 { | |
| 115 if ( father[f] != f ) | |
| 116 return father[f] = GetFather( father[f], father ) ; | |
| 117 return f ; | |
| 118 } | |
| 119 | |
| 120 int TranscriptDecider::GetTranscriptGeneId( std::vector<int> &subexonInd, struct _subexon *subexons ) | |
| 121 { | |
| 122 int i ; | |
| 123 int size = subexonInd.size() ; | |
| 124 | |
| 125 for ( i = 0 ; i < size ; ++i ) | |
| 126 if ( subexons[ subexonInd[i] ].geneId != -2 ) | |
| 127 return subexons[ subexonInd[i] ].geneId ; | |
| 128 | |
| 129 // Handle the extreme case where all the regions are mixture regions. | |
| 130 for ( i = 0 ; i < size - 1 ; ++i ) | |
| 131 if ( subexons[ subexonInd[i] ].end + 1 < subexons[ subexonInd[i + 1] ].start ) | |
| 132 { | |
| 133 return defaultGeneId[ ( subexons[ subexonInd[i] ].rightStrand + 1 ) / 2 ] ; | |
| 134 } | |
| 135 return defaultGeneId[0] ; | |
| 136 } | |
| 137 | |
| 138 int TranscriptDecider::GetTranscriptGeneId( struct _transcript &t, struct _subexon *subexons ) | |
| 139 { | |
| 140 if ( subexons[ t.first ].geneId != -2 ) | |
| 141 return subexons[ t.first ].geneId ; | |
| 142 if ( subexons[ t.last ].geneId != -2 ) | |
| 143 return subexons[ t.last ].geneId ; | |
| 144 std::vector<int> subexonInd ; | |
| 145 t.seVector.GetOnesIndices( subexonInd ) ; | |
| 146 return GetTranscriptGeneId( subexonInd, subexons ) ; | |
| 147 } | |
| 148 | |
| 149 void TranscriptDecider::InitTranscriptId() | |
| 150 { | |
| 151 int i ; | |
| 152 for ( i = 0 ; i < usedGeneId - baseGeneId ; ++i ) | |
| 153 transcriptId[i] = 0 ; | |
| 154 } | |
| 155 | |
| 156 bool TranscriptDecider::IsStartOfMixtureStrandRegion( int tag, struct _subexon *subexons, int seCnt ) | |
| 157 { | |
| 158 int j, k ; | |
| 159 int leftStrandCnt[2] = {0, 0}, rightStrandCnt[2] = {0, 0}; | |
| 160 for ( j = tag + 1 ; j < seCnt ; ++j ) | |
| 161 if ( subexons[j].start > subexons[j - 1].end + 1 ) | |
| 162 break ; | |
| 163 | |
| 164 for ( k = tag ; k < j ; ++k ) | |
| 165 { | |
| 166 if ( subexons[k].leftStrand != 0 ) | |
| 167 ++leftStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] ; | |
| 168 if ( subexons[k].rightStrand != 0 ) | |
| 169 ++rightStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] ; | |
| 170 } | |
| 171 | |
| 172 if ( rightStrandCnt[0] > 0 && leftStrandCnt[0] == 0 && leftStrandCnt[1] > 0 ) | |
| 173 return true ; | |
| 174 if ( rightStrandCnt[1] > 0 && leftStrandCnt[1] == 0 && leftStrandCnt[0] > 0 ) | |
| 175 return true ; | |
| 176 return false ; | |
| 177 } | |
| 178 | |
| 179 // Return 0 - incompatible or no overlap at all. 1 - fully compatible. 2 - head of the constraint compatible with the tail of the transcript. | |
| 180 // The partially compatible case (return 2) most likely happens in DP, where we have a partial transcript. | |
| 181 int TranscriptDecider::IsConstraintInTranscript( struct _transcript transcript, struct _constraint &c ) | |
| 182 { | |
| 183 //printf( "%d %d, %d %d\n", c.first, c.last, transcript.first, transcript.last ) ; | |
| 184 if ( c.first < transcript.first || c.first > transcript.last | |
| 185 || !transcript.seVector.Test( c.first ) | |
| 186 || ( !transcript.partial && !transcript.seVector.Test( c.last ) ) ) // no overlap, starts too early, or some chosen subexon is not compatible | |
| 187 return 0 ; | |
| 188 | |
| 189 // Extract the subexons we should focus on. | |
| 190 int s, e ; | |
| 191 s = c.first ; | |
| 192 e = c.last ; | |
| 193 bool returnPartial = false ; | |
| 194 if ( e > transcript.last ) // the constraint ends after the transcript. | |
| 195 { | |
| 196 if ( transcript.partial ) | |
| 197 { | |
| 198 e = transcript.last ; | |
| 199 returnPartial = true ; | |
| 200 } | |
| 201 else | |
| 202 return 0 ; | |
| 203 } | |
| 204 /*printf( "%s: %d %d: (%d %d) (%d %d)\n", __func__, s, e, | |
| 205 transcript.seVector.Test(0), transcript.seVector.Test(1), | |
| 206 c.vector.Test(0), c.vector.Test(1) ) ;*/ | |
| 207 | |
| 208 compatibleTestVectorT.Assign( transcript.seVector ) ; | |
| 209 //compatibleTestVectorT.MaskRegionOutsideInRange( s, e, transcript.first, transcript.last ) ; | |
| 210 compatibleTestVectorT.MaskRegionOutside( s, e ) ; | |
| 211 | |
| 212 compatibleTestVectorC.Assign( c.vector ) ; | |
| 213 if ( c.last > transcript.last ) | |
| 214 { | |
| 215 //compatibleTestVectorC.MaskRegionOutsideInRange( s, e, c.first, c.last ) ; | |
| 216 //compatibleTestVectorC.MaskRegionOutside( s, e ) ; | |
| 217 compatibleTestVectorC.MaskRegionOutside( 0, e ) ; // Because the bits before s are already all 0s in C. | |
| 218 } | |
| 219 /*printf( "after masking %d %d. %d %d %d %d:\n", s, e, transcript.first, transcript.last, c.first, c.last ) ; | |
| 220 compatibleTestVectorT.Print() ; | |
| 221 compatibleTestVectorC.Print() ; */ | |
| 222 // Test compatibility. | |
| 223 int ret = 0 ; | |
| 224 if ( compatibleTestVectorT.IsEqual( compatibleTestVectorC ) ) | |
| 225 { | |
| 226 if ( returnPartial ) | |
| 227 ret = 2 ; | |
| 228 else | |
| 229 ret = 1 ; | |
| 230 } | |
| 231 | |
| 232 return ret ; | |
| 233 } | |
| 234 | |
| 235 int TranscriptDecider::IsConstraintInTranscriptDebug( struct _transcript transcript, struct _constraint &c ) | |
| 236 { | |
| 237 //printf( "%d %d, %d %d\n", c.first, c.last, transcript.first, transcript.last ) ; | |
| 238 if ( c.first < transcript.first || c.first > transcript.last ) // no overlap or starts too early. | |
| 239 return 0 ; | |
| 240 printf( "hi\n" ) ; | |
| 241 // Extract the subexons we should focus on. | |
| 242 int s, e ; | |
| 243 s = c.first ; | |
| 244 e = c.last ; | |
| 245 bool returnPartial = false ; | |
| 246 if ( e > transcript.last ) // the constraint ends after the transcript. | |
| 247 { | |
| 248 if ( transcript.partial ) | |
| 249 { | |
| 250 e = transcript.last ; | |
| 251 returnPartial = true ; | |
| 252 } | |
| 253 else | |
| 254 return 0 ; | |
| 255 } | |
| 256 /*printf( "%s: %d %d: (%d %d) (%d %d)\n", __func__, s, e, | |
| 257 transcript.seVector.Test(0), transcript.seVector.Test(1), | |
| 258 c.vector.Test(0), c.vector.Test(1) ) ;*/ | |
| 259 | |
| 260 compatibleTestVectorT.Assign( transcript.seVector ) ; | |
| 261 compatibleTestVectorT.MaskRegionOutside( s, e ) ; | |
| 262 | |
| 263 compatibleTestVectorC.Assign( c.vector ) ; | |
| 264 if ( e > transcript.last ) | |
| 265 compatibleTestVectorC.MaskRegionOutside( s, e ) ; | |
| 266 /*printf( "after masking: (%d %d) (%d %d)\n", | |
| 267 compatibleTestVectorT.Test(0), compatibleTestVectorT.Test(1), | |
| 268 compatibleTestVectorC.Test(0), compatibleTestVectorC.Test(1) ) ;*/ | |
| 269 | |
| 270 // Test compatibility. | |
| 271 int ret = 0 ; | |
| 272 if ( compatibleTestVectorT.IsEqual( compatibleTestVectorC ) ) | |
| 273 { | |
| 274 if ( returnPartial ) | |
| 275 ret = 2 ; | |
| 276 else | |
| 277 ret = 1 ; | |
| 278 } | |
| 279 compatibleTestVectorT.Print() ; | |
| 280 compatibleTestVectorC.Print() ; | |
| 281 printf( "ret=%d\n", ret ) ; | |
| 282 return ret ; | |
| 283 } | |
| 284 int TranscriptDecider::SubTranscriptCount( int tag, struct _subexon *subexons, int *f ) | |
| 285 { | |
| 286 if ( f[tag] != -1 ) | |
| 287 return f[tag] ; | |
| 288 | |
| 289 int ret = 0 ; | |
| 290 int i ; | |
| 291 if ( subexons[tag].canBeEnd ) | |
| 292 ret = 1 ; | |
| 293 for ( i = 0 ; i < subexons[tag].nextCnt ; ++i ) | |
| 294 { | |
| 295 ret += SubTranscriptCount( subexons[tag].next[i], subexons, f ) ; | |
| 296 } | |
| 297 | |
| 298 if ( ret == 0 ) | |
| 299 ret = 1 ; | |
| 300 return f[tag] = ret ; | |
| 301 } | |
| 302 | |
| 303 void TranscriptDecider::CoalesceSameTranscripts( std::vector<struct _transcript> &t ) | |
| 304 { | |
| 305 int i, k ; | |
| 306 if ( t.size() == 0 ) | |
| 307 return ; | |
| 308 | |
| 309 std::sort( t.begin(), t.end(), CompSortTranscripts ) ; | |
| 310 | |
| 311 int size = t.size() ; | |
| 312 k = 0 ; | |
| 313 for ( i = 1 ; i < size ; ++i ) | |
| 314 { | |
| 315 if ( t[k].seVector.IsEqual( t[i].seVector ) ) | |
| 316 { | |
| 317 t[k].abundance += t[i].abundance ; | |
| 318 t[i].seVector.Release() ; | |
| 319 } | |
| 320 else | |
| 321 { | |
| 322 ++k ; | |
| 323 if ( i != k ) | |
| 324 t[k] = t[i] ; | |
| 325 } | |
| 326 } | |
| 327 t.resize( k + 1 ) ; | |
| 328 } | |
| 329 | |
| 330 void TranscriptDecider::EnumerateTranscript( int tag, int strand, int visit[], int vcnt, struct _subexon *subexons, SubexonCorrelation &correlation, double correlationScore, std::vector<struct _transcript> &alltranscripts, int &atcnt ) | |
| 331 { | |
| 332 int i ; | |
| 333 visit[ vcnt ] = tag ; | |
| 334 //printf( "%s: %d %d %d %d. %d %d\n", __func__, vcnt, tag, subexons[tag].nextCnt, strand, subexons[tag].start, subexons[tag].end ) ; | |
| 335 // Compute the correlation score | |
| 336 double minCor = correlationScore ; | |
| 337 for ( i = 0 ; i < vcnt ; ++i ) | |
| 338 { | |
| 339 double tmp = correlation.Query( visit[i], visit[vcnt] ) ; | |
| 340 if ( tmp < minCor ) | |
| 341 minCor = tmp ; | |
| 342 } | |
| 343 | |
| 344 if ( subexons[tag].canBeEnd ) | |
| 345 { | |
| 346 struct _transcript &txpt = alltranscripts[atcnt] ; | |
| 347 for ( i = 0 ; i <= vcnt ; ++i ) | |
| 348 txpt.seVector.Set( visit[i] ) ; | |
| 349 | |
| 350 txpt.first = visit[0] ; | |
| 351 txpt.last = visit[vcnt] ; | |
| 352 txpt.partial = false ; | |
| 353 txpt.correlationScore = minCor ; | |
| 354 | |
| 355 //printf( "%lf %d %d ", txpt.correlationScore, vcnt, visit[0] ) ; | |
| 356 //txpt.seVector.Print() ; | |
| 357 ++atcnt ; | |
| 358 } | |
| 359 | |
| 360 for ( i = 0 ; i < subexons[tag].nextCnt ; ++i ) | |
| 361 { | |
| 362 int a = subexons[tag].next[i] ; | |
| 363 if ( !SubexonGraph::IsSameStrand( subexons[tag].rightStrand, strand ) | |
| 364 && subexons[a].start > subexons[tag].end + 1 ) | |
| 365 continue ; | |
| 366 int backupStrand = strand ; | |
| 367 if ( subexons[a].start > subexons[tag].end + 1 && strand == 0 ) | |
| 368 strand = subexons[tag].rightStrand ; | |
| 369 EnumerateTranscript( subexons[tag].next[i], strand, visit, vcnt + 1, subexons, correlation, minCor, alltranscripts, atcnt ) ; | |
| 370 strand = backupStrand ; | |
| 371 } | |
| 372 } | |
| 373 | |
| 374 void TranscriptDecider::SearchSubTranscript( int tag, int strand, int parents[], int pcnt, struct _dp &pdp, int visit[], int vcnt, int extends[], int extendCnt, | |
| 375 std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) | |
| 376 { | |
| 377 int i ; | |
| 378 int size ; | |
| 379 double cover ; | |
| 380 bool keepSearch = true ; | |
| 381 bool belowMin = false ; | |
| 382 | |
| 383 struct _subexon *subexons = attr.subexons ; | |
| 384 | |
| 385 visit[vcnt] = tag ; | |
| 386 ++vcnt ; | |
| 387 struct _dp visitdp ; | |
| 388 | |
| 389 visitdp.cover = -1 ; | |
| 390 | |
| 391 struct _transcript &subTxpt = attr.bufferTxpt ; | |
| 392 subTxpt.seVector.Reset() ; | |
| 393 for ( i = 0 ; i < pcnt ; ++i ) | |
| 394 subTxpt.seVector.Set( parents[i] ) ; | |
| 395 subTxpt.first = parents[0] ; | |
| 396 subTxpt.last = parents[ pcnt - 1] ; | |
| 397 for ( i = 0 ; i < vcnt ; ++i ) | |
| 398 subTxpt.seVector.Set( visit[i] ) ; | |
| 399 subTxpt.last = visit[ vcnt - 1 ] ; | |
| 400 subTxpt.partial = true ; | |
| 401 | |
| 402 // Adjust the extendCnt | |
| 403 /*printf( "%s: %d %d %d\n", __func__, vcnt , extendCnt, extends[ extendCnt - 1] ) ; | |
| 404 subTxpt.seVector.Print() ; | |
| 405 tc[extends[extendCnt - 1]].vector.Print() ; | |
| 406 printf( "Adjust extend:\n") ;*/ | |
| 407 for ( i = extendCnt - 1 ; i >= 0 ; --i ) | |
| 408 { | |
| 409 if ( tc[ extends[i] ].last <= tag || ( tc[ extends[i] ].vector.Test( tag ) && IsConstraintInTranscript( subTxpt, tc[ extends[i] ] ) != 0 ) ) | |
| 410 break ; | |
| 411 } | |
| 412 extendCnt = i + 1 ; | |
| 413 | |
| 414 // If the extension ends. | |
| 415 subTxpt.partial = false ; | |
| 416 if ( subexons[tag].nextCnt > 0 && ( extendCnt == 0 || tag >= tc[ extends[ extendCnt - 1 ] ].last ) ) | |
| 417 { | |
| 418 // Solve the subtranscript beginning with visit. | |
| 419 // Now we have the optimal transcript for visit. | |
| 420 visitdp = SolveSubTranscript( visit, vcnt, strand, tc, tcStartInd, attr ) ; | |
| 421 keepSearch = false ; | |
| 422 } | |
| 423 //printf( "%s %d %d: visitdp.cover=%lf\n", __func__, parents[0], tag, visitdp.cover ) ; | |
| 424 | |
| 425 // Handle the constraints spanning the parents and visit. | |
| 426 size = tc.size() ; | |
| 427 if ( visitdp.cover >= 0 ) | |
| 428 { | |
| 429 cover = visitdp.cover ; | |
| 430 // Reset the subTxpt, since its content is modified in SolveSubTxpt called above. | |
| 431 subTxpt.seVector.Reset() ; | |
| 432 for ( i = 0 ; i < pcnt ; ++i ) | |
| 433 subTxpt.seVector.Set( parents[i] ) ; | |
| 434 subTxpt.seVector.Or( visitdp.seVector ) ; | |
| 435 subTxpt.first = parents[0] ; | |
| 436 subTxpt.last = visitdp.last ; | |
| 437 subTxpt.partial = false ; | |
| 438 | |
| 439 if ( !attr.forAbundance && attr.minAbundance > 0 ) | |
| 440 { | |
| 441 for ( i = 0 ; i < pcnt - 1 ; ++i ) | |
| 442 { | |
| 443 if ( attr.uncoveredPair.find( parents[i] * attr.seCnt + parents[i + 1] ) != attr.uncoveredPair.end() ) | |
| 444 belowMin = true ; | |
| 445 } | |
| 446 for ( i = -1 ; i < vcnt - 1 ; ++i ) | |
| 447 { | |
| 448 if ( i == -1 && pcnt >= 1 ) | |
| 449 { | |
| 450 if ( attr.uncoveredPair.find( parents[pcnt - 1] * attr.seCnt + visit[0] ) != attr.uncoveredPair.end() ) | |
| 451 belowMin = true ; | |
| 452 } | |
| 453 else | |
| 454 { | |
| 455 if ( attr.uncoveredPair.find( visit[i] * attr.seCnt + visit[i + 1] ) != attr.uncoveredPair.end() ) | |
| 456 belowMin = true ; | |
| 457 } | |
| 458 } | |
| 459 if ( attr.forAbundance && belowMin ) | |
| 460 cover = 1e-6 ; | |
| 461 } | |
| 462 | |
| 463 for ( i = tcStartInd ; i < size ; ++i ) | |
| 464 { | |
| 465 if ( tc[i].first > parents[ pcnt - 1] ) | |
| 466 break ; | |
| 467 | |
| 468 if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) | |
| 469 { | |
| 470 if ( tc[i].normAbund <= attr.minAbundance ) | |
| 471 { | |
| 472 belowMin = true ; | |
| 473 cover = -2 ; | |
| 474 break ; | |
| 475 } | |
| 476 | |
| 477 if ( tc[i].abundance <= 0 ) | |
| 478 continue ; | |
| 479 | |
| 480 if ( attr.forAbundance ) | |
| 481 { | |
| 482 if ( tc[i].normAbund < cover || cover == 0 ) | |
| 483 cover = tc[i].normAbund ; | |
| 484 } | |
| 485 else | |
| 486 { | |
| 487 ++cover ; | |
| 488 } | |
| 489 } | |
| 490 } | |
| 491 if ( belowMin && pdp.cover == -1 ) | |
| 492 { | |
| 493 pdp.cover = -2 ; | |
| 494 pdp.seVector.Assign( subTxpt.seVector ) ; | |
| 495 pdp.first = subTxpt.first ; | |
| 496 pdp.last = subTxpt.last ; | |
| 497 pdp.strand = strand ; | |
| 498 } | |
| 499 else if ( cover > pdp.cover ) | |
| 500 { | |
| 501 pdp.cover = cover ; | |
| 502 pdp.seVector.Assign( subTxpt.seVector ) ; | |
| 503 pdp.first = subTxpt.first ; | |
| 504 pdp.last = subTxpt.last ; | |
| 505 pdp.strand = strand ; | |
| 506 } | |
| 507 } | |
| 508 else if ( visitdp.cover == -2 && pdp.cover == -1 ) // no valid extension from visit | |
| 509 { | |
| 510 subTxpt.seVector.Reset() ; | |
| 511 for ( i = 0 ; i < pcnt ; ++i ) | |
| 512 subTxpt.seVector.Set( parents[i] ) ; | |
| 513 subTxpt.seVector.Or( visitdp.seVector ) ; | |
| 514 subTxpt.first = parents[0] ; | |
| 515 subTxpt.last = visitdp.last ; | |
| 516 | |
| 517 pdp.cover = -2 ; | |
| 518 pdp.seVector.Assign( subTxpt.seVector ) ; | |
| 519 pdp.first = subTxpt.first ; | |
| 520 pdp.last = subTxpt.last ; | |
| 521 pdp.strand = strand ; | |
| 522 } | |
| 523 | |
| 524 if ( subexons[tag].canBeEnd && ( visitdp.cover < 0 || attr.forAbundance ) ) | |
| 525 // This works because the extension always covers more constraints, so we only take this branch if the extension does not work | |
| 526 // or if it violates minAbundance. | |
| 527 // But we also need to come here when we want to compute the maxAbundance transcript. | |
| 528 // This part also works as the exit point of the recursive function. | |
| 529 { | |
| 530 bool belowMin = false ; | |
| 531 subTxpt.seVector.Reset() ; | |
| 532 for ( i = 0 ; i < pcnt ; ++i ) | |
| 533 subTxpt.seVector.Set( parents[i] ) ; | |
| 534 for ( i = 0 ; i < vcnt ; ++i ) | |
| 535 subTxpt.seVector.Set( visit[i] ) ; | |
| 536 subTxpt.first = parents[0] ; | |
| 537 subTxpt.last = visit[ vcnt - 1] ; | |
| 538 subTxpt.partial = false ; | |
| 539 | |
| 540 cover = 0 ; | |
| 541 if ( attr.forAbundance || attr.minAbundance > 0 ) | |
| 542 { | |
| 543 for ( i = 0 ; i < pcnt - 1 ; ++i ) | |
| 544 { | |
| 545 if ( attr.uncoveredPair.find( parents[i] * attr.seCnt + parents[i + 1] ) != attr.uncoveredPair.end() ) | |
| 546 belowMin = true ; | |
| 547 } | |
| 548 for ( i = -1 ; i < vcnt - 1 ; ++i ) | |
| 549 { | |
| 550 if ( i == -1 && pcnt >= 1 ) | |
| 551 { | |
| 552 if ( attr.uncoveredPair.find( parents[pcnt - 1] * attr.seCnt + visit[0] ) != attr.uncoveredPair.end() ) | |
| 553 belowMin = true ; | |
| 554 } | |
| 555 else | |
| 556 { | |
| 557 if ( attr.uncoveredPair.find( visit[i] * attr.seCnt + visit[i + 1] ) != attr.uncoveredPair.end() ) | |
| 558 belowMin = true ; | |
| 559 } | |
| 560 } | |
| 561 | |
| 562 //if ( belowMin == true ) | |
| 563 // printf( "turned belowMin. %d. %d %d: %d %d %d\n", attr.uncoveredPair.size(), pcnt, vcnt, parents[0], visit[0], visit[ vcnt - 1] ) ; | |
| 564 | |
| 565 if ( attr.forAbundance && belowMin ) | |
| 566 cover = 1e-6 ; | |
| 567 } | |
| 568 | |
| 569 for ( i = tcStartInd ; i < size ; ++i ) | |
| 570 { | |
| 571 // note that the value is parents[ pcnt - 1], because | |
| 572 // above, the part of "visit" is computed in SolveSubTranscript( visit ). | |
| 573 if ( tc[i].first > visit[ vcnt - 1] ) | |
| 574 break ; | |
| 575 if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) | |
| 576 { | |
| 577 if ( tc[i].normAbund <= attr.minAbundance ) | |
| 578 { | |
| 579 belowMin = true ; | |
| 580 cover = -2 ; | |
| 581 break ; | |
| 582 } | |
| 583 | |
| 584 if ( tc[i].abundance <= 0 ) | |
| 585 continue ; | |
| 586 if ( attr.forAbundance ) | |
| 587 { | |
| 588 if ( tc[i].normAbund < cover || cover == 0 ) | |
| 589 cover = tc[i].normAbund ; | |
| 590 } | |
| 591 else | |
| 592 { | |
| 593 ++cover ; | |
| 594 } | |
| 595 } | |
| 596 } | |
| 597 | |
| 598 if ( belowMin && pdp.cover == -1 ) | |
| 599 { | |
| 600 pdp.cover = -2 ; | |
| 601 pdp.seVector.Assign( subTxpt.seVector ) ; | |
| 602 pdp.first = subTxpt.first ; | |
| 603 pdp.last = subTxpt.last ; | |
| 604 pdp.strand = strand ; | |
| 605 } | |
| 606 else if ( cover > pdp.cover ) | |
| 607 { | |
| 608 pdp.cover = cover ; | |
| 609 pdp.seVector.Assign( subTxpt.seVector ) ; | |
| 610 pdp.first = subTxpt.first ; | |
| 611 pdp.last = subTxpt.last ; | |
| 612 pdp.strand = strand ; | |
| 613 } | |
| 614 } | |
| 615 //printf( "%s %d: pdp.cover=%lf\n", __func__, tag, pdp.cover ) ; | |
| 616 | |
| 617 // keep searching. | |
| 618 if ( keepSearch ) | |
| 619 { | |
| 620 for ( i = 0 ; i < subexons[tag].nextCnt ; ++i ) | |
| 621 { | |
| 622 int b = subexons[tag].next[i] ; | |
| 623 if ( ( SubexonGraph::IsSameStrand( subexons[tag].rightStrand, strand ) | |
| 624 && SubexonGraph::IsSameStrand( subexons[b].leftStrand, strand ) ) || | |
| 625 subexons[b].start == subexons[tag].end + 1 ) | |
| 626 { | |
| 627 int backupStrand = strand ; | |
| 628 if ( subexons[b].start > subexons[tag].end + 1 ) | |
| 629 strand = subexons[tag].rightStrand ; | |
| 630 | |
| 631 SearchSubTranscript( subexons[tag].next[i], strand, parents, pcnt, pdp, visit, vcnt, | |
| 632 extends, extendCnt, tc, tcStartInd, attr ) ; | |
| 633 strand = backupStrand ; | |
| 634 } | |
| 635 } | |
| 636 | |
| 637 } | |
| 638 | |
| 639 return ; | |
| 640 } | |
| 641 | |
| 642 struct _dp TranscriptDecider::SolveSubTranscript( int visit[], int vcnt, int strand, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) | |
| 643 { | |
| 644 int i ; | |
| 645 int size ; | |
| 646 /*printf( "%s: ", __func__ ) ; | |
| 647 for ( i = 0 ; i < vcnt ; ++i ) | |
| 648 printf( "%d ", visit[i] ) ; | |
| 649 printf( ": %lf %d %d", attr.f1[ visit[0] ].cover, attr.f1[ visit[0] ].timeStamp, attr.timeStamp ) ; | |
| 650 printf( "\n" ) ;*/ | |
| 651 // Test whether it is stored in dp | |
| 652 if ( vcnt == 1 ) | |
| 653 { | |
| 654 if ( attr.f1[ visit[0] ].cover != -1 && attr.f1[ visit[0] ].strand == strand && ( attr.f1[ visit[0] ].timeStamp == attr.timeStamp || | |
| 655 ( attr.f1[ visit[0] ].minAbundance < attr.minAbundance && attr.f1[visit[0]].cover == -2 ) ) ) //even given lower minAbundance threshold, it fails | |
| 656 { | |
| 657 return attr.f1[ visit[0] ] ; | |
| 658 } | |
| 659 } | |
| 660 else if ( vcnt == 2 && attr.f2 ) | |
| 661 { | |
| 662 int a = visit[0] ; | |
| 663 int b = visit[1] ; | |
| 664 | |
| 665 if ( attr.f2[a][b].cover != -1 && attr.f2[a][b].strand == strand && ( attr.f2[a][b].timeStamp == attr.timeStamp || | |
| 666 ( attr.f2[a][b].minAbundance < attr.minAbundance && attr.f2[a][b].cover == -2 ) ) ) | |
| 667 { | |
| 668 return attr.f2[a][b] ; | |
| 669 } | |
| 670 } | |
| 671 else | |
| 672 { | |
| 673 int key = 0 ; | |
| 674 for ( i = 0 ; i < vcnt ; ++i ) | |
| 675 key = ( key * attr.seCnt + visit[i] ) % hashMax ; | |
| 676 if ( key < 0 ) | |
| 677 key += hashMax ; | |
| 678 | |
| 679 if ( attr.hash[key].cover != -1 && attr.hash[key].cnt == vcnt && attr.hash[key].strand == strand && | |
| 680 ( attr.hash[key].first == visit[0] ) && | |
| 681 ( attr.hash[key].timeStamp == attr.timeStamp || | |
| 682 ( attr.hash[key].minAbundance < attr.minAbundance && attr.hash[key].cover == -2 ) ) ) | |
| 683 { | |
| 684 struct _transcript subTxpt = attr.bufferTxpt ; | |
| 685 subTxpt.seVector.Reset() ; | |
| 686 for ( i = 0 ; i < vcnt ; ++i ) | |
| 687 subTxpt.seVector.Set( visit[i] ) ; | |
| 688 //subTxpt.seVector.Print() ; | |
| 689 //attr.hash[key].seVector.Print() ; | |
| 690 subTxpt.seVector.Xor( attr.hash[key].seVector ) ; | |
| 691 subTxpt.seVector.MaskRegionOutside( visit[0], visit[ vcnt - 1] ) ; | |
| 692 //printf( "hash test: %d %d\n", key, subTxpt.seVector.IsAllZero() ) ; | |
| 693 if ( subTxpt.seVector.IsAllZero() ) | |
| 694 { | |
| 695 return attr.hash[key] ; | |
| 696 } | |
| 697 | |
| 698 // Can't use the code below, because visit only holds the head (vcnt) subexons. | |
| 699 /*for ( i = 0 ; i < vcnt ; ++i ) | |
| 700 if ( !attr.hash[key].seVector.Test( visit[i] ) ) | |
| 701 break ; | |
| 702 if ( i >= vcnt ) | |
| 703 return attr.hash[key] ;*/ | |
| 704 | |
| 705 } | |
| 706 } | |
| 707 // adjust tcStartInd | |
| 708 size = tc.size() ; | |
| 709 for ( i = tcStartInd ; i < size ; ++i ) | |
| 710 if ( tc[i].first >= visit[0] ) | |
| 711 break ; | |
| 712 tcStartInd = i ; | |
| 713 | |
| 714 | |
| 715 struct _subexon *subexons = attr.subexons ; | |
| 716 struct _dp visitdp ; | |
| 717 visitdp.seVector.Init( attr.seCnt ) ; | |
| 718 visitdp.cover = -1 ; | |
| 719 | |
| 720 struct _transcript &subTxpt = attr.bufferTxpt ; | |
| 721 // This happens when it is called from PickTranscriptsByDP: the first subexon might be the end. | |
| 722 subTxpt.seVector.Reset() ; | |
| 723 for ( i = 0 ; i < vcnt ; ++i ) | |
| 724 subTxpt.seVector.Set( visit[i] ) ; | |
| 725 subTxpt.first = visit[0] ; | |
| 726 subTxpt.last = visit[vcnt - 1] ; | |
| 727 | |
| 728 if ( subexons[ visit[vcnt - 1] ].canBeEnd ) | |
| 729 { | |
| 730 subTxpt.partial = false ; | |
| 731 double cover = 0 ; | |
| 732 for ( i = tcStartInd ; i < size ; ++i ) | |
| 733 { | |
| 734 if ( tc[i].first > subTxpt.last ) | |
| 735 break ; | |
| 736 | |
| 737 if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) | |
| 738 { | |
| 739 if ( tc[i].normAbund <= attr.minAbundance ) | |
| 740 { | |
| 741 cover = -2 ; | |
| 742 break ; | |
| 743 } | |
| 744 | |
| 745 if ( tc[i].abundance <= 0 ) | |
| 746 continue ; | |
| 747 if ( attr.forAbundance ) | |
| 748 { | |
| 749 if ( tc[i].normAbund < cover || cover == 0 ) | |
| 750 cover = tc[i].normAbund ; | |
| 751 } | |
| 752 else | |
| 753 ++cover ; | |
| 754 } | |
| 755 } | |
| 756 | |
| 757 visitdp.seVector.Assign( subTxpt.seVector ) ; | |
| 758 visitdp.cover = cover ; | |
| 759 visitdp.first = subTxpt.first ; | |
| 760 visitdp.last = subTxpt.last ; | |
| 761 visitdp.strand = strand ; | |
| 762 } | |
| 763 | |
| 764 // Now we extend. | |
| 765 size = tc.size() ; | |
| 766 int *extends = new int[tc.size() - tcStartInd + 1] ; | |
| 767 int extendCnt = 0 ; | |
| 768 subTxpt.partial = true ; | |
| 769 for ( i = tcStartInd ; i < size ; ++i ) | |
| 770 { | |
| 771 if ( tc[i].first > subTxpt.last ) | |
| 772 break ; | |
| 773 if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 2 ) | |
| 774 { | |
| 775 extends[extendCnt] = i ; | |
| 776 ++extendCnt ; | |
| 777 } | |
| 778 } | |
| 779 | |
| 780 // Sort the extends by the index of their last subexon. | |
| 781 if ( extendCnt > 0 ) | |
| 782 { | |
| 783 struct _pair32 *extendsPairs = new struct _pair32[extendCnt] ; | |
| 784 | |
| 785 for ( i = 0 ; i < extendCnt ; ++i ) | |
| 786 { | |
| 787 extendsPairs[i].a = extends[i] ; | |
| 788 extendsPairs[i].b = tc[ extends[i] ].last ; | |
| 789 } | |
| 790 qsort( extendsPairs, extendCnt, sizeof( struct _pair32 ), CompPairsByB ) ; | |
| 791 | |
| 792 for ( i = 0 ; i < extendCnt ; ++i ) | |
| 793 extends[i] = extendsPairs[i].a ; | |
| 794 | |
| 795 delete[] extendsPairs ; | |
| 796 } | |
| 797 | |
| 798 size = subexons[ visit[vcnt - 1] ].nextCnt ; | |
| 799 int nextvCnt = 1 ; | |
| 800 if ( extendCnt > 0 && tc[ extends[ extendCnt - 1 ] ].last - visit[ vcnt - 1 ] > 1 ) | |
| 801 nextvCnt = tc[ extends[ extendCnt - 1 ] ].last - visit[ vcnt - 1 ] ; | |
| 802 int *nextv = new int[ nextvCnt ] ; | |
| 803 for ( i = 0 ; i < size ; ++i ) | |
| 804 { | |
| 805 int a = visit[vcnt - 1] ; | |
| 806 int b = subexons[a].next[i] ; | |
| 807 if ( ( SubexonGraph::IsSameStrand( subexons[a].rightStrand, strand ) | |
| 808 && SubexonGraph::IsSameStrand( subexons[b].leftStrand, strand ) ) | |
| 809 || | |
| 810 subexons[b].start == subexons[a].end + 1 ) | |
| 811 { | |
| 812 int backupStrand = strand ; | |
| 813 if ( subexons[b].start > subexons[a].end + 1 ) | |
| 814 strand = subexons[a].rightStrand ; | |
| 815 SearchSubTranscript( subexons[ visit[vcnt - 1] ].next[i], strand, visit, vcnt, visitdp, nextv, 0, extends, extendCnt, tc, tcStartInd, attr ) ; | |
| 816 strand = backupStrand ; | |
| 817 | |
| 818 } | |
| 819 } | |
| 820 //printf( "%s %d(%d) %d %d %d: %lf\n", __func__, visit[0], subexons[ visit[vcnt - 1] ].canBeEnd, size, extendCnt, strand, visitdp.cover ) ; | |
| 821 delete[] nextv ; | |
| 822 delete[] extends ; | |
| 823 | |
| 824 // store the result in the dp structure. | |
| 825 // We return the structure stored in dp to simplify the memory access pattern. | |
| 826 // In other words, we assume the structure returned from this function always uses the memory from attr.dp | |
| 827 if ( vcnt == 1 ) | |
| 828 { | |
| 829 SetDpContent( attr.f1[ visit[0] ], visitdp, attr ) ; | |
| 830 visitdp.seVector.Release() ; | |
| 831 return attr.f1[ visit[0] ] ; | |
| 832 } | |
| 833 else if ( vcnt == 2 && attr.f2 ) | |
| 834 { | |
| 835 SetDpContent( attr.f2[ visit[0] ][ visit[1] ], visitdp, attr ) ; | |
| 836 visitdp.seVector.Release() ; | |
| 837 return attr.f2[ visit[0] ][ visit[1] ] ; | |
| 838 } | |
| 839 else | |
| 840 { | |
| 841 int key = 0 ; | |
| 842 for ( i = 0 ; i < vcnt ; ++i ) | |
| 843 key = ( key * attr.seCnt + visit[i] ) % hashMax ; | |
| 844 if ( key < 0 ) | |
| 845 key += hashMax ; | |
| 846 | |
| 847 //static int hashUsed = 0 ; | |
| 848 //if ( attr.hash[key].cover == -1 ) | |
| 849 // ++hashUsed ; | |
| 850 //printf( "%d/%d\n", hashUsed, HASH_MAX) ; | |
| 851 //printf( "hash write: %d\n", key ) ; | |
| 852 SetDpContent( attr.hash[key], visitdp, attr ) ; | |
| 853 attr.hash[key].cnt = vcnt ; | |
| 854 visitdp.seVector.Release() ; | |
| 855 return attr.hash[key] ; | |
| 856 } | |
| 857 } | |
| 858 | |
| 859 | |
| 860 void TranscriptDecider::PickTranscriptsByDP( struct _subexon *subexons, int seCnt, int iterBound, Constraints &constraints, SubexonCorrelation &correlation, struct _dpAttribute &attr, std::vector<struct _transcript> &alltranscripts ) | |
| 861 { | |
| 862 int i, j, k ; | |
| 863 | |
| 864 std::vector<struct _transcript> transcripts ; | |
| 865 std::vector<struct _constraint> &tc = constraints.constraints ; | |
| 866 int tcCnt = tc.size() ; | |
| 867 int coalesceThreshold = 1024 ; | |
| 868 | |
| 869 //printf( "tcCnt=%d\n", tcCnt ) ; | |
| 870 | |
| 871 attr.timeStamp = 1 ; | |
| 872 attr.bufferTxpt.seVector.Init( seCnt ) ; | |
| 873 attr.subexons = subexons ; | |
| 874 attr.seCnt = seCnt ; | |
| 875 | |
| 876 double maxAbundance = -1 ; | |
| 877 // Initialize the dp data structure | |
| 878 /*memset( attr.f1, -1, sizeof( struct _dp ) * seCnt ) ; | |
| 879 for ( i = 0 ; i < seCnt ; ++i ) | |
| 880 memset( attr.f2[i], -1, sizeof( struct _dp ) * seCnt ) ; | |
| 881 memset( attr.hash, -1, sizeof( struct _dp ) * HASH_MAX ) ;*/ | |
| 882 for ( i = 0 ; i < seCnt ; ++i ) | |
| 883 ResetDpContent( attr.f1[i] ) ; | |
| 884 for ( i = 0 ; i < seCnt && attr.f2 ; ++i ) | |
| 885 for ( j = i ; j < seCnt ; ++j ) | |
| 886 ResetDpContent( attr.f2[i][j] ) ; | |
| 887 for ( i = 0 ; i < hashMax ; ++i ) | |
| 888 ResetDpContent( attr.hash[i] ) ; | |
| 889 | |
| 890 // Set the uncovered pair | |
| 891 attr.uncoveredPair.clear() ; | |
| 892 BitTable bufferTable( seCnt ) ; | |
| 893 k = 0 ; | |
| 894 for ( i = 0 ; i < seCnt ; ++i ) | |
| 895 { | |
| 896 for ( ; k < tcCnt ; ++k ) | |
| 897 { | |
| 898 if ( tc[k].last >= i ) | |
| 899 break ; | |
| 900 } | |
| 901 | |
| 902 if ( k >= tcCnt || tc[k].first > i ) | |
| 903 { | |
| 904 for ( j = 0 ; j < subexons[i].nextCnt ; ++j ) | |
| 905 { | |
| 906 attr.uncoveredPair[i * seCnt + subexons[i].next[j] ] = 1 ; | |
| 907 } | |
| 908 continue ; | |
| 909 } | |
| 910 | |
| 911 for ( j = 0 ; j < subexons[i].nextCnt ; ++j ) | |
| 912 { | |
| 913 bool covered = false ; | |
| 914 int l, n ; | |
| 915 | |
| 916 n = subexons[i].next[j] ; | |
| 917 for ( l = k ; l < tcCnt ; ++l ) | |
| 918 { | |
| 919 if ( tc[l].first > i ) | |
| 920 break ; | |
| 921 if ( tc[l].vector.Test( i ) && tc[l].vector.Test( n ) ) | |
| 922 { | |
| 923 if ( n == i + 1 ) | |
| 924 { | |
| 925 covered = true ; | |
| 926 break ; | |
| 927 } | |
| 928 else | |
| 929 { | |
| 930 bufferTable.Assign( tc[l].vector ) ; | |
| 931 bufferTable.MaskRegionOutside( i + 1, n - 1 ) ; | |
| 932 if ( bufferTable.IsAllZero() ) | |
| 933 { | |
| 934 covered = true ; | |
| 935 break ; | |
| 936 } | |
| 937 } | |
| 938 } | |
| 939 } | |
| 940 | |
| 941 if ( !covered ) | |
| 942 { | |
| 943 //printf( "set!: (%d: %d %d) (%d: %d %d)\n", i, subexons[i].start, subexons[i].end, n, subexons[n].start, subexons[n].end ) ; | |
| 944 attr.uncoveredPair[ i * seCnt + n ] = 1 ; | |
| 945 } | |
| 946 } | |
| 947 } | |
| 948 bufferTable.Release() ; | |
| 949 | |
| 950 | |
| 951 // Find the max abundance | |
| 952 attr.forAbundance = true ; | |
| 953 attr.minAbundance = 0 ; | |
| 954 for ( i = 0 ; i < seCnt ; ++i ) | |
| 955 { | |
| 956 if ( subexons[i].canBeStart ) | |
| 957 { | |
| 958 int visit[1] = {i} ; | |
| 959 struct _dp tmp ; | |
| 960 | |
| 961 tmp = SolveSubTranscript( visit, 1, 0, tc, 0, attr ) ; | |
| 962 | |
| 963 if ( tmp.cover > maxAbundance ) | |
| 964 maxAbundance = tmp.cover ; | |
| 965 } | |
| 966 } | |
| 967 //PrintLog( "maxAbundance=%lf", maxAbundance ) ; | |
| 968 //exit( 1 ) ; | |
| 969 | |
| 970 // Pick the transcripts. Quantitative Set-Cover. | |
| 971 // Notice that by the logic in SearchSubTxpt and SolveSubTxpt, we don't need to reinitialize the data structure. | |
| 972 attr.forAbundance = false ; | |
| 973 int *coveredTc = new int[tcCnt] ; | |
| 974 int coveredTcCnt ; | |
| 975 struct _dp maxCoverDp ; | |
| 976 struct _dp bestDp ; | |
| 977 std::map<double, struct _dp> cachedCoverResult ; | |
| 978 | |
| 979 maxCoverDp.seVector.Init( seCnt ) ; | |
| 980 bestDp.seVector.Init( seCnt ) ; | |
| 981 int iterCnt = 0 ; | |
| 982 | |
| 983 while ( 1 ) | |
| 984 { | |
| 985 double bestScore ; | |
| 986 | |
| 987 // iteratively assign constraints | |
| 988 attr.minAbundance = 0 ; | |
| 989 | |
| 990 // Find the best candidate transcript. | |
| 991 bestDp.cover = -1 ; | |
| 992 bestScore = -1 ; | |
| 993 while ( 1 ) | |
| 994 { | |
| 995 // iterate over the changes of minAbundance | |
| 996 if ( cachedCoverResult.find( attr.minAbundance ) != cachedCoverResult.end() ) | |
| 997 { | |
| 998 struct _dp tmp = cachedCoverResult[ attr.minAbundance ] ; | |
| 999 SetDpContent( maxCoverDp, tmp, attr ) ; | |
| 1000 } | |
| 1001 else | |
| 1002 { | |
| 1003 maxCoverDp.cover = -1 ; | |
| 1004 ++attr.timeStamp ; | |
| 1005 for ( i = 0 ; i < seCnt ; ++i ) | |
| 1006 { | |
| 1007 if ( subexons[i].canBeStart == false ) | |
| 1008 continue ; | |
| 1009 int visit[1] = {i} ; | |
| 1010 struct _dp tmp ; | |
| 1011 tmp = SolveSubTranscript( visit, 1, 0, tc, 0, attr ) ; | |
| 1012 | |
| 1013 if ( tmp.cover > maxCoverDp.cover && tmp.cover > 0 ) | |
| 1014 { | |
| 1015 SetDpContent( maxCoverDp, tmp, attr ) ; | |
| 1016 } | |
| 1017 //if ( subexons[i].start == 6870264 || subexons[i].start == 6872237 ) | |
| 1018 // printf( "%d: %lf\n", i, tmp.cover ) ; | |
| 1019 } | |
| 1020 | |
| 1021 if ( maxCoverDp.cover == -1 ) | |
| 1022 break ; | |
| 1023 struct _dp ccr ; | |
| 1024 ccr.seVector.Init( seCnt ) ; | |
| 1025 SetDpContent( ccr, maxCoverDp, attr ) ; | |
| 1026 cachedCoverResult[ attr.minAbundance ] = ccr ; | |
| 1027 } | |
| 1028 // the abundance for the max cover txpt. | |
| 1029 double min = -1 ; | |
| 1030 struct _transcript &subTxpt = attr.bufferTxpt ; | |
| 1031 subTxpt.seVector.Assign( maxCoverDp.seVector ) ; | |
| 1032 subTxpt.first = maxCoverDp.first ; | |
| 1033 subTxpt.last = maxCoverDp.last ; | |
| 1034 | |
| 1035 for ( i = 0 ; i < tcCnt ; ++i ) | |
| 1036 { | |
| 1037 if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) | |
| 1038 { | |
| 1039 if ( tc[i].normAbund < min || min == -1 ) | |
| 1040 min = tc[i].normAbund ; | |
| 1041 } | |
| 1042 } | |
| 1043 | |
| 1044 if ( attr.minAbundance == 0 ) | |
| 1045 { | |
| 1046 std::vector<int> subexonIdx ; | |
| 1047 maxCoverDp.seVector.GetOnesIndices( subexonIdx ) ; | |
| 1048 int size = subexonIdx.size() ; | |
| 1049 for ( i = 0 ; i < size - 1 ; ++i ) | |
| 1050 if ( attr.uncoveredPair.find( subexonIdx[i] * seCnt + subexonIdx[i + 1] ) != attr.uncoveredPair.end() ) | |
| 1051 { | |
| 1052 min = 1e-6 ; | |
| 1053 break ; | |
| 1054 } | |
| 1055 } | |
| 1056 | |
| 1057 double score = ComputeScore( maxCoverDp.cover, 1.0, min, maxAbundance, 0 ) ; | |
| 1058 if ( bestScore == -1 || score > bestScore ) | |
| 1059 { | |
| 1060 bestScore = score ; | |
| 1061 SetDpContent( bestDp, maxCoverDp, attr ) ; | |
| 1062 } | |
| 1063 else if ( score < bestScore ) | |
| 1064 { | |
| 1065 if ( ComputeScore( maxCoverDp.cover, 1.0, maxAbundance, maxAbundance, 0 ) < bestScore ) | |
| 1066 break ; | |
| 1067 } | |
| 1068 //PrintLog( "normAbund=%lf maxCoverDp.cover=%lf score=%lf timeStamp=%d", min, maxCoverDp.cover, score, attr.timeStamp ) ; | |
| 1069 attr.minAbundance = min ; | |
| 1070 } // end of iteration for minAbundance. | |
| 1071 | |
| 1072 if ( bestDp.cover == -1 ) | |
| 1073 break ; | |
| 1074 // Assign the constraints. | |
| 1075 coveredTcCnt = 0 ; | |
| 1076 double update = -1 ; | |
| 1077 struct _transcript &subTxpt = attr.bufferTxpt ; | |
| 1078 subTxpt.seVector.Assign( bestDp.seVector ) ; | |
| 1079 subTxpt.first = bestDp.first ; | |
| 1080 subTxpt.last = bestDp.last ; | |
| 1081 subTxpt.partial = false ; | |
| 1082 for ( i = 0 ; i < tcCnt ; ++i ) | |
| 1083 { | |
| 1084 if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) | |
| 1085 { | |
| 1086 if ( tc[i].abundance > 0 && | |
| 1087 ( tc[i].abundance < update || update == -1 ) ) | |
| 1088 { | |
| 1089 update = tc[i].abundance ; | |
| 1090 } | |
| 1091 coveredTc[ coveredTcCnt ] = i ; | |
| 1092 ++coveredTcCnt ; | |
| 1093 } | |
| 1094 /*else | |
| 1095 { | |
| 1096 printf( "%d: ", i ) ; | |
| 1097 tc[i].vector.Print() ; | |
| 1098 if ( i == 127 ) | |
| 1099 { | |
| 1100 printf( "begin debug:\n" ) ; | |
| 1101 IsConstraintInTranscriptDebug( subTxpt, tc[i] ) ; | |
| 1102 } | |
| 1103 }*/ | |
| 1104 } | |
| 1105 update *= ( 1 + iterCnt / 50 ) ;//* ( 1 + iterCnt / 50 ) ; | |
| 1106 | |
| 1107 //PrintLog( "%d: update=%lf %d %d. %d %d %d", iterCnt, update, coveredTcCnt, tcCnt, | |
| 1108 // bestDp.first, bestDp.last, subexons[ bestDp.first ].start ) ; | |
| 1109 //bestDp.seVector.Print() ; | |
| 1110 | |
| 1111 struct _transcript nt ; | |
| 1112 nt.seVector.Duplicate( bestDp.seVector ) ; | |
| 1113 nt.first = bestDp.first ; | |
| 1114 nt.last = bestDp.last ; | |
| 1115 nt.partial = false ; | |
| 1116 nt.abundance = 0 ; | |
| 1117 for ( i = 0 ; i < coveredTcCnt ; ++i ) | |
| 1118 { | |
| 1119 j = coveredTc[i] ; | |
| 1120 if ( tc[j].abundance > 0 ) | |
| 1121 { | |
| 1122 double tmp = ( tc[j].abundance > update ? update : tc[j].abundance ) ; | |
| 1123 tc[j].abundance -= tmp ; | |
| 1124 double factor = 1 ; | |
| 1125 | |
| 1126 nt.abundance += ( tc[j].support * update / tc[j].normAbund * factor ) ; | |
| 1127 | |
| 1128 if ( tc[j].abundance <= 0 ) | |
| 1129 { | |
| 1130 std::vector<double> removeKey ; | |
| 1131 for ( std::map<double, struct _dp>::iterator it = cachedCoverResult.begin() ; it != cachedCoverResult.end() ; ++it ) | |
| 1132 { | |
| 1133 subTxpt.seVector.Assign( it->second.seVector ) ; | |
| 1134 subTxpt.first = it->second.first ; | |
| 1135 subTxpt.last = it->second.last ; | |
| 1136 subTxpt.partial = false ; | |
| 1137 if ( IsConstraintInTranscript( subTxpt, tc[j] ) == 1 ) | |
| 1138 { | |
| 1139 it->second.seVector.Release() ; | |
| 1140 removeKey.push_back( it->first ) ; | |
| 1141 } | |
| 1142 } | |
| 1143 int size = removeKey.size() ; | |
| 1144 int l ; | |
| 1145 for ( l = 0 ; l < size ; ++l ) | |
| 1146 cachedCoverResult.erase( removeKey[l] ) ; | |
| 1147 } | |
| 1148 } | |
| 1149 | |
| 1150 if ( tc[j].abundance < 0 ) | |
| 1151 tc[j].abundance = 0 ; | |
| 1152 } | |
| 1153 | |
| 1154 transcripts.push_back( nt ) ; | |
| 1155 if ( transcripts.size() >= transcripts.capacity() && (int)transcripts.size() >= coalesceThreshold ) | |
| 1156 { | |
| 1157 CoalesceSameTranscripts( transcripts ) ; | |
| 1158 if ( transcripts.size() >= transcripts.capacity() / 2 ) | |
| 1159 coalesceThreshold *= 2 ; | |
| 1160 } | |
| 1161 ++iterCnt ; | |
| 1162 | |
| 1163 if ( iterCnt >= iterBound ) | |
| 1164 break ; | |
| 1165 } | |
| 1166 CoalesceSameTranscripts( transcripts ) ; | |
| 1167 int size = transcripts.size() ; | |
| 1168 // Compute the correlation score | |
| 1169 for ( i = 0 ; i < size ; ++i ) | |
| 1170 { | |
| 1171 std::vector<int> subexonInd ; | |
| 1172 transcripts[i].seVector.GetOnesIndices( subexonInd ) ; | |
| 1173 double cor = 2.0 ; | |
| 1174 int s = subexonInd.size() ; | |
| 1175 for ( j = 0 ; j < s ; ++j ) | |
| 1176 for ( k = j + 1 ; k < s ; ++k ) | |
| 1177 { | |
| 1178 double tmp = correlation.Query( subexonInd[j], subexonInd[k] ) ; | |
| 1179 if ( tmp < cor ) | |
| 1180 cor = tmp ; | |
| 1181 } | |
| 1182 if ( cor > 1 ) | |
| 1183 cor = 0 ; | |
| 1184 transcripts[i].correlationScore = cor ; | |
| 1185 } | |
| 1186 | |
| 1187 // store the result | |
| 1188 for ( i = 0 ; i < size ; ++i ) | |
| 1189 alltranscripts.push_back( transcripts[i] ) ; | |
| 1190 | |
| 1191 // Release the memory | |
| 1192 for ( std::map<double, struct _dp>::iterator it = cachedCoverResult.begin() ; it != cachedCoverResult.end() ; ++it ) | |
| 1193 { | |
| 1194 it->second.seVector.Release() ; | |
| 1195 } | |
| 1196 attr.bufferTxpt.seVector.Release() ; | |
| 1197 | |
| 1198 delete[] coveredTc ; | |
| 1199 maxCoverDp.seVector.Release() ; | |
| 1200 bestDp.seVector.Release() ; | |
| 1201 } | |
| 1202 | |
| 1203 | |
| 1204 // Add the prefix/suffix transcripts to the list | |
| 1205 void TranscriptDecider::AugmentTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, int limit, bool extend ) | |
| 1206 { | |
| 1207 int i, j, k ; | |
| 1208 int size = alltranscripts.size() ; | |
| 1209 if ( size >= limit ) | |
| 1210 return ; | |
| 1211 | |
| 1212 // Augment suffix, prefix transcripts | |
| 1213 for ( i = 0 ; i < size ; ++i ) | |
| 1214 { | |
| 1215 std::vector<int> subexonIdx ; | |
| 1216 alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ; | |
| 1217 int seIdxCnt = subexonIdx.size() ; | |
| 1218 // suffix | |
| 1219 for ( j = 1 ; j < seIdxCnt ; ++j ) | |
| 1220 { | |
| 1221 if ( subexons[ subexonIdx[j] ].canBeStart ) | |
| 1222 { | |
| 1223 struct _transcript nt ; | |
| 1224 nt.first = subexonIdx[j] ; | |
| 1225 nt.last = alltranscripts[i].last ; | |
| 1226 nt.seVector.Duplicate( alltranscripts[i].seVector ) ; | |
| 1227 nt.seVector.MaskRegionOutside( nt.first, nt.last ) ; | |
| 1228 nt.partial = false ; | |
| 1229 nt.correlationScore = 0 ; | |
| 1230 nt.abundance = 0 ; | |
| 1231 nt.constraintsSupport = NULL ; | |
| 1232 | |
| 1233 alltranscripts.push_back( nt ) ; | |
| 1234 if ( alltranscripts.size() >= limit ) | |
| 1235 return ; | |
| 1236 } | |
| 1237 } | |
| 1238 | |
| 1239 // prefix | |
| 1240 for ( j = 0 ; j < seIdxCnt - 1 ; ++j ) | |
| 1241 { | |
| 1242 if ( subexons[ subexonIdx[j] ].canBeEnd ) | |
| 1243 { | |
| 1244 struct _transcript nt ; | |
| 1245 nt.first = alltranscripts[i].first ; | |
| 1246 nt.last = subexonIdx[j] ; | |
| 1247 nt.seVector.Duplicate( alltranscripts[i].seVector ) ; | |
| 1248 nt.seVector.MaskRegionOutside( nt.first, nt.last ) ; | |
| 1249 nt.partial = false ; | |
| 1250 nt.correlationScore = 0 ; | |
| 1251 nt.abundance = 0 ; | |
| 1252 nt.constraintsSupport = NULL ; | |
| 1253 | |
| 1254 alltranscripts.push_back( nt ) ; | |
| 1255 if ( alltranscripts.size() >= limit ) | |
| 1256 return ; | |
| 1257 } | |
| 1258 } | |
| 1259 | |
| 1260 if ( extend ) | |
| 1261 { | |
| 1262 // Extensions to the right. | |
| 1263 for ( j = 0 ; j < seIdxCnt ; ++j ) | |
| 1264 { | |
| 1265 if ( subexons[ subexonIdx[j] ].nextCnt > 1 ) | |
| 1266 { | |
| 1267 for ( k = 0 ; k < subexons[ subexonIdx[j] ].nextCnt ; ++k ) | |
| 1268 { | |
| 1269 int idx = subexons[ subexonIdx[j] ].next[k] ; | |
| 1270 | |
| 1271 if ( alltranscripts[i].seVector.Test( idx ) ) | |
| 1272 continue ; | |
| 1273 int l ; | |
| 1274 std::vector<int> visited ; | |
| 1275 while ( 1 ) | |
| 1276 { | |
| 1277 if ( subexons[idx].nextCnt > 1 || subexons[idx].prevCnt > 1 ) | |
| 1278 { | |
| 1279 break ; | |
| 1280 } | |
| 1281 | |
| 1282 visited.push_back( idx ) ; | |
| 1283 if ( subexons[idx].canBeEnd && subexons[idx].nextCnt == 0 ) | |
| 1284 { | |
| 1285 struct _transcript nt ; | |
| 1286 nt.first = alltranscripts[i].first ; | |
| 1287 nt.last = idx ; | |
| 1288 nt.seVector.Duplicate( alltranscripts[i].seVector ) ; | |
| 1289 nt.seVector.MaskRegionOutside( nt.first, subexonIdx[j] ) ; | |
| 1290 int visitedSize = visited.size() ; | |
| 1291 for ( l = 0 ; l < visitedSize ; ++l ) | |
| 1292 nt.seVector.Set( visited[l] ) ; | |
| 1293 nt.partial = false ; | |
| 1294 nt.correlationScore = 0 ; | |
| 1295 nt.abundance = 0 ; | |
| 1296 nt.constraintsSupport = NULL ; | |
| 1297 | |
| 1298 alltranscripts.push_back( nt ) ; | |
| 1299 if ( alltranscripts.size() >= limit ) | |
| 1300 return ; | |
| 1301 } | |
| 1302 | |
| 1303 if ( subexons[idx].nextCnt == 1 ) | |
| 1304 idx = subexons[idx].next[0] ; | |
| 1305 else | |
| 1306 break ; | |
| 1307 } | |
| 1308 } | |
| 1309 } | |
| 1310 } | |
| 1311 | |
| 1312 // Extensions to the left | |
| 1313 for ( j = 0 ; j < seIdxCnt ; ++j ) | |
| 1314 { | |
| 1315 if ( subexons[ subexonIdx[j] ].prevCnt > 1 ) | |
| 1316 { | |
| 1317 for ( k = 0 ; k < subexons[ subexonIdx[j] ].prevCnt ; ++k ) | |
| 1318 { | |
| 1319 int idx = subexons[ subexonIdx[j] ].prev[k] ; | |
| 1320 | |
| 1321 if ( alltranscripts[i].seVector.Test( idx ) ) | |
| 1322 continue ; | |
| 1323 int l ; | |
| 1324 std::vector<int> visited ; | |
| 1325 while ( 1 ) | |
| 1326 { | |
| 1327 if ( subexons[idx].nextCnt > 1 || subexons[idx].prevCnt > 1 ) | |
| 1328 { | |
| 1329 break ; | |
| 1330 } | |
| 1331 | |
| 1332 visited.push_back( idx ) ; | |
| 1333 if ( subexons[idx].canBeStart && subexons[idx].prevCnt == 0 ) | |
| 1334 { | |
| 1335 struct _transcript nt ; | |
| 1336 nt.first = idx ; | |
| 1337 nt.last = alltranscripts[i].last ; | |
| 1338 nt.seVector.Duplicate( alltranscripts[i].seVector ) ; | |
| 1339 nt.seVector.MaskRegionOutside( subexonIdx[j], nt.last ) ; | |
| 1340 int visitedSize = visited.size() ; | |
| 1341 for ( l = 0 ; l < visitedSize ; ++l ) | |
| 1342 nt.seVector.Set( visited[l] ) ; | |
| 1343 nt.partial = false ; | |
| 1344 nt.correlationScore = 0 ; | |
| 1345 nt.abundance = 0 ; | |
| 1346 nt.constraintsSupport = NULL ; | |
| 1347 | |
| 1348 alltranscripts.push_back( nt ) ; | |
| 1349 if ( alltranscripts.size() >= limit ) | |
| 1350 return ; | |
| 1351 } | |
| 1352 | |
| 1353 if ( subexons[idx].prevCnt == 1 ) | |
| 1354 idx = subexons[idx].prev[0] ; | |
| 1355 else | |
| 1356 break ; | |
| 1357 } | |
| 1358 } | |
| 1359 } | |
| 1360 } | |
| 1361 } // end of if ( extend ) | |
| 1362 } | |
| 1363 | |
| 1364 CoalesceSameTranscripts( alltranscripts ) ; | |
| 1365 } | |
| 1366 | |
| 1367 // Pick the transcripts from given transcripts. | |
| 1368 void TranscriptDecider::PickTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, Constraints &constraints, | |
| 1369 SubexonCorrelation &seCorrelation, std::vector<struct _transcript> &transcripts ) | |
| 1370 { | |
| 1371 int i, j, k ; | |
| 1372 std::vector<int> chosen ; | |
| 1373 std::vector<struct _matePairConstraint> &tc = constraints.matePairs ; | |
| 1374 int atcnt = alltranscripts.size() ; | |
| 1375 int tcCnt = tc.size() ; // transcript constraints | |
| 1376 int seCnt = 0 ; | |
| 1377 | |
| 1378 if ( tcCnt == 0 ) | |
| 1379 return ; | |
| 1380 if ( atcnt > 0 ) | |
| 1381 seCnt = alltranscripts[0].seVector.GetSize() ; | |
| 1382 else | |
| 1383 return ; | |
| 1384 | |
| 1385 double inf = -1 ; // infinity | |
| 1386 int coalesceThreshold = 1024 ; | |
| 1387 int *transcriptSeCnt = new int[ atcnt ] ; | |
| 1388 int *transcriptLength = new int[atcnt] ; | |
| 1389 double *transcriptAbundance = new double[atcnt] ; // the roughly estimated abundance based on constraints. | |
| 1390 double *avgTranscriptAbundance = new double[atcnt] ; // the average normAbund from the compatible constraints. | |
| 1391 | |
| 1392 BitTable *btable = new BitTable[ atcnt ] ; | |
| 1393 //BitTable lowCovSubexon ; // force the abundance to 0 for the transcript contains the subexon. | |
| 1394 double *coveredPortion = new double[atcnt] ; | |
| 1395 | |
| 1396 memset( avgTranscriptAbundance, 0 ,sizeof( double ) * atcnt ) ; | |
| 1397 for ( i = 0 ; i < atcnt ; ++i ) | |
| 1398 btable[i].Init( tcCnt ) ; | |
| 1399 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 1400 { | |
| 1401 int a = constraints.matePairs[j].i ; | |
| 1402 int b = constraints.matePairs[j].j ; | |
| 1403 | |
| 1404 if ( constraints.constraints[a].support > inf ) | |
| 1405 inf = constraints.constraints[a].support ; | |
| 1406 if ( constraints.constraints[b].support > inf ) | |
| 1407 inf = constraints.constraints[b].support ; | |
| 1408 | |
| 1409 if ( tc[j].normAbund > inf ) | |
| 1410 inf = tc[j].normAbund ; | |
| 1411 | |
| 1412 tc[j].abundance = tc[j].normAbund ; | |
| 1413 } | |
| 1414 ++inf ; | |
| 1415 bool btableSet = false ; | |
| 1416 for ( i = 0 ; i < atcnt ; ++i ) | |
| 1417 { | |
| 1418 //printf( "correlation %d: %lf\n", i, alltranscripts[i].correlationScore ) ; | |
| 1419 /*for ( int l = 0 ; l < subexonInd.size() ; ++l ) | |
| 1420 { | |
| 1421 for ( int m = l ; m < subexonInd.size() ; ++m ) | |
| 1422 printf( "%lf ", seCorrelation.Query( l, m ) ) ; | |
| 1423 printf( "\n" ) ; | |
| 1424 }*/ | |
| 1425 | |
| 1426 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 1427 { | |
| 1428 int a = tc[j].i ; | |
| 1429 int b = tc[j].j ; | |
| 1430 | |
| 1431 //printf( "try set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ; | |
| 1432 //alltranscripts[i].seVector.Print() ; | |
| 1433 //constraints.constraints[a].vector.Print() ; | |
| 1434 //constraints.constraints[b].vector.Print() ; | |
| 1435 if ( IsConstraintInTranscript( alltranscripts[i], constraints.constraints[a] ) == 1 | |
| 1436 && IsConstraintInTranscript( alltranscripts[i], constraints.constraints[b] ) == 1 ) | |
| 1437 { | |
| 1438 //printf( "set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ; | |
| 1439 btable[i].Set( j ) ; | |
| 1440 btableSet = true ; | |
| 1441 } | |
| 1442 } | |
| 1443 transcriptSeCnt[i] = alltranscripts[i].seVector.Count() ; | |
| 1444 } | |
| 1445 if ( btableSet == false ) | |
| 1446 { | |
| 1447 for ( i = 0 ; i < atcnt ; ++i ) | |
| 1448 btable[i].Release() ; | |
| 1449 delete[] btable ; | |
| 1450 return ; | |
| 1451 } | |
| 1452 | |
| 1453 double maxAbundance = -1 ; // The abundance of the most-abundant transcript | |
| 1454 double *adjustScore = new double[atcnt] ; | |
| 1455 memset( adjustScore, 0, sizeof( double ) * atcnt ) ; | |
| 1456 if ( atcnt > 0 /*&& alltranscripts[0].abundance == -1*/ ) | |
| 1457 { | |
| 1458 struct _pair32 *chain = new struct _pair32[seCnt] ; | |
| 1459 bool *covered = new bool[seCnt] ; | |
| 1460 bool *usedConstraints = new bool[constraints.constraints.size() ] ; | |
| 1461 std::vector<BitTable> togetherChain ; // subexons that are more likely to show up in the same transcript, like an IR with overhang, should stay together to represent a 3'/5'-end | |
| 1462 | |
| 1463 /*lowCovSubexon.Init( seCnt ) ; | |
| 1464 double *avgDepth = new double[seCnt ] ; | |
| 1465 | |
| 1466 memset( avgDepth, 0, sizeof( double ) * seCnt ) ; | |
| 1467 int size = constraints.constraints.size() ; | |
| 1468 for ( i = 0 ; i < size ; ++i ) | |
| 1469 { | |
| 1470 std::vector<int> subexonIdx ; | |
| 1471 constraints.constraints[i].GetOnesIndices( subexonIdx ) ; | |
| 1472 int seIdxCnt = subexonidx.size() ; | |
| 1473 for ( j = 0 ; j < seIdxCnt ; ++j ) | |
| 1474 avgDepth[ subexonidx[j] ] += constraints.constraints[i].support ; | |
| 1475 } | |
| 1476 for ( i = 0 ; i < seCnt ; ++i ) | |
| 1477 { | |
| 1478 if ( avgDepth[i] * alignments.readLen / (double)( subexons[i].end - subexons[i].start + 1 ) < 1 ) | |
| 1479 }*/ | |
| 1480 | |
| 1481 struct _pair32 firstRegion, lastRegion ; | |
| 1482 | |
| 1483 for ( i = 0 ; i < seCnt ; ) | |
| 1484 { | |
| 1485 for ( j = i + 1 ; j < seCnt ; ++j ) | |
| 1486 { | |
| 1487 if ( subexons[j].start > subexons[j - 1].end + 1 ) | |
| 1488 break ; | |
| 1489 } | |
| 1490 | |
| 1491 | |
| 1492 int cnt = 0 ; | |
| 1493 for ( k = i ; k < j ; ++k ) | |
| 1494 { | |
| 1495 if ( ( subexons[k].leftType == 2 && subexons[k].rightType == 1 ) | |
| 1496 || ( subexons[k].leftType == 0 && subexons[k].rightType == 1 ) | |
| 1497 || ( subexons[k].leftType == 2 && subexons[k].rightType == 0 ) ) | |
| 1498 ++cnt ; | |
| 1499 } | |
| 1500 | |
| 1501 if ( cnt <= 1 ) | |
| 1502 { | |
| 1503 i = j ; | |
| 1504 continue ; | |
| 1505 } | |
| 1506 | |
| 1507 BitTable tmpTable( seCnt ) ; | |
| 1508 for ( k = i ; k < j ; ++k ) | |
| 1509 { | |
| 1510 if ( ( subexons[k].leftType == 2 && subexons[k].rightType == 1 ) | |
| 1511 || ( subexons[k].leftType == 0 && subexons[k].rightType == 1 ) | |
| 1512 || ( subexons[k].leftType == 2 && subexons[k].rightType == 0 ) ) | |
| 1513 tmpTable.Set( k ) ; | |
| 1514 } | |
| 1515 togetherChain.push_back( tmpTable ) ; | |
| 1516 i = j ; | |
| 1517 } | |
| 1518 | |
| 1519 for ( i = 0 ; i < atcnt ; ++i ) | |
| 1520 { | |
| 1521 double value = inf ; | |
| 1522 int tag = -1 ; | |
| 1523 | |
| 1524 alltranscripts[i].abundance = 0 ; | |
| 1525 alltranscripts[i].constraintsSupport = new double[tcCnt] ; | |
| 1526 | |
| 1527 std::vector<int> subexonIdx ; | |
| 1528 alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ; | |
| 1529 int seIdxCnt = subexonIdx.size() ; | |
| 1530 transcriptLength[i] = 0 ; | |
| 1531 | |
| 1532 firstRegion.a = subexonIdx[0] ; | |
| 1533 for ( j = 1 ; j < seIdxCnt ; ++j ) | |
| 1534 { | |
| 1535 if ( subexons[ subexonIdx[j] ].start > subexons[ subexonIdx[j - 1] ].end + 1 ) | |
| 1536 break ; | |
| 1537 } | |
| 1538 firstRegion.b = subexonIdx[j - 1] ; | |
| 1539 lastRegion.b = subexonIdx[ seIdxCnt - 1 ] ; | |
| 1540 for ( j = seIdxCnt - 2 ; j >= 0 ; --j ) | |
| 1541 { | |
| 1542 if ( subexons[ subexonIdx[j] ].end < subexons[ subexonIdx[j + 1] ].start - 1 ) | |
| 1543 break ; | |
| 1544 } | |
| 1545 lastRegion.a = subexonIdx[j + 1] ; | |
| 1546 | |
| 1547 for ( j = 0 ; j < seIdxCnt ; ++j ) | |
| 1548 transcriptLength[i] += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ; | |
| 1549 | |
| 1550 //for ( j = firstRegion.b ; j < lastRegion.a ; ++j ) | |
| 1551 for ( j = 0 ; j < seIdxCnt - 1 ; ++j ) | |
| 1552 { | |
| 1553 chain[j].a = subexonIdx[j] ; | |
| 1554 chain[j].b = subexonIdx[j + 1] ; | |
| 1555 covered[j] = false ; | |
| 1556 } | |
| 1557 memset( usedConstraints, false, sizeof( bool ) * constraints.constraints.size() ) ; | |
| 1558 int compatibleCnt = 0 ; | |
| 1559 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 1560 { | |
| 1561 alltranscripts[i].constraintsSupport[j] = 0 ; | |
| 1562 if ( btable[i].Test(j) && tc[j].abundance > 0 ) | |
| 1563 { | |
| 1564 ++compatibleCnt ; | |
| 1565 double adjustAbundance = tc[j].abundance ; | |
| 1566 if ( seIdxCnt > 1 ) | |
| 1567 { | |
| 1568 if ( tc[j].i == tc[j].j | |
| 1569 && ( constraints.constraints[ tc[j].i ].first + | |
| 1570 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].first | |
| 1571 || constraints.constraints[ tc[j].i ].first + | |
| 1572 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].last ) ) | |
| 1573 { | |
| 1574 adjustAbundance = inf ; | |
| 1575 } | |
| 1576 else if ( tc[j].i != tc[j].j | |
| 1577 && ( constraints.constraints[ tc[j].i ].first + | |
| 1578 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].first | |
| 1579 || constraints.constraints[ tc[j].i ].first + | |
| 1580 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].last ) ) | |
| 1581 { | |
| 1582 adjustAbundance = constraints.constraints[ tc[j].j ].normAbund ; | |
| 1583 } | |
| 1584 else if ( tc[j].i != tc[j].j | |
| 1585 && ( constraints.constraints[ tc[j].j ].first + | |
| 1586 constraints.constraints[ tc[j].j ].last == 2 * alltranscripts[i].first | |
| 1587 || constraints.constraints[ tc[j].j ].first + | |
| 1588 constraints.constraints[ tc[j].j ].last == 2 * alltranscripts[i].last ) ) | |
| 1589 { | |
| 1590 adjustAbundance = constraints.constraints[ tc[j].i ].normAbund ; | |
| 1591 } | |
| 1592 } | |
| 1593 if ( adjustAbundance < value ) | |
| 1594 /*!( seIdxCnt > 1 | |
| 1595 && ( ( ( constraints.constraints[ tc[j].i ].first >= firstRegion.a && constraints.constraints[ tc[j].i ].last <= firstRegion.b ) | |
| 1596 && ( constraints.constraints[ tc[j].j ].first >= firstRegion.a && constraints.constraints[ tc[j].j ].last <= firstRegion.b ) ) | |
| 1597 || ( ( constraints.constraints[ tc[j].i ].first >= lastRegion.a && constraints.constraints[ tc[j].i ].last <= lastRegion.b ) | |
| 1598 && ( constraints.constraints[ tc[j].j ].first >= lastRegion.a && constraints.constraints[ tc[j].j ].last <= lastRegion.b ) ) ) ) | |
| 1599 )*/ | |
| 1600 { | |
| 1601 // Do not use constraints that lie entirely within the 3'/5' end of the transcript | |
| 1602 value = adjustAbundance ; | |
| 1603 tag = j ; | |
| 1604 } | |
| 1605 avgTranscriptAbundance[i] += tc[j].abundance ; | |
| 1606 | |
| 1607 if ( !usedConstraints[ tc[j].i ] ) | |
| 1608 { | |
| 1609 struct _constraint &c = constraints.constraints[ tc[j].i ] ; | |
| 1610 for ( k = 0 ; k < seIdxCnt - 1 ; ++k ) | |
| 1611 { | |
| 1612 // Note that since the constraint is already compatible with the txpt, | |
| 1613 // chain[k].a/b must also be adjacent in this constraint. | |
| 1614 if ( c.vector.Test( chain[k].a ) && c.vector.Test( chain[k].b ) ) | |
| 1615 covered[k] = true ; | |
| 1616 } | |
| 1617 usedConstraints[ tc[j].i ] = true ; | |
| 1618 } | |
| 1619 | |
| 1620 if ( !usedConstraints[ tc[j].j ] ) | |
| 1621 { | |
| 1622 struct _constraint &c = constraints.constraints[ tc[j].j ] ; | |
| 1623 for ( k = 0 ; k < seIdxCnt - 1 ; ++k ) | |
| 1624 { | |
| 1625 if ( c.vector.Test( chain[k].a ) && c.vector.Test( chain[k].b ) ) | |
| 1626 covered[k] = true ; | |
| 1627 } | |
| 1628 usedConstraints[ tc[j].j ] = true ; | |
| 1629 } | |
| 1630 } | |
| 1631 } | |
| 1632 | |
| 1633 // Apply a penalty if subexons that should appear together do not show up together | |
| 1634 int size = togetherChain.size() ; | |
| 1635 if ( size > 0 ) | |
| 1636 { | |
| 1637 BitTable bufferTable( seCnt ) ; | |
| 1638 for ( j = 0 ; j < size ; ++j ) | |
| 1639 { | |
| 1640 bufferTable.Assign( togetherChain[j] ) ; | |
| 1641 bufferTable.And( alltranscripts[i].seVector ) ; | |
| 1642 //if ( !bufferTable.IsAllZero() && !bufferTable.IsEqual( togetherChain[j] ) ) | |
| 1643 // value /= 2 ; | |
| 1644 | |
| 1645 if ( !bufferTable.IsAllZero() ) | |
| 1646 { | |
| 1647 if ( bufferTable.IsEqual( togetherChain[j] ) ) | |
| 1648 //printf( "nice together!\n" ) ; | |
| 1649 ; | |
| 1650 else | |
| 1651 value /= 2 ; | |
| 1652 //printf( "bad together!\n" ) ; | |
| 1653 } | |
| 1654 } | |
| 1655 bufferTable.Release() ; | |
| 1656 } | |
| 1657 | |
| 1658 | |
| 1659 // Every two-subexon chain should be covered by some reads if a transcript is expressed highly enough | |
| 1660 int cnt = 0 ; | |
| 1661 for ( j = 0 ; j < seIdxCnt - 1 ; ++j ) | |
| 1662 if ( covered[j] == false ) // && j >= firstRegion.b && j <= lastRegion.a - 1 ) | |
| 1663 { | |
| 1664 value = 0 ; | |
| 1665 } | |
| 1666 else | |
| 1667 ++cnt ; | |
| 1668 if ( seIdxCnt > 1 ) | |
| 1669 coveredPortion[i] = (double)cnt / (double)( seIdxCnt - 1 ) ; | |
| 1670 else | |
| 1671 coveredPortion[i] = 1 ; | |
| 1672 if ( coveredPortion[i] == 0 ) | |
| 1673 coveredPortion[i] = (double)0.5 / ( seIdxCnt ) ; | |
| 1674 | |
| 1675 // For a short subexon (readLength-subexon_length-1>30), we further require a constraint covering three consecutive subexons | |
| 1676 /*memset( usedConstraints, false, sizeof( bool ) * constraints.constraints.size() ) ; | |
| 1677 for ( j = 1 ; j < seIdxCnt - 1 ; ++j ) | |
| 1678 { | |
| 1679 int k = subexonIdx[j] ; | |
| 1680 if ( alignments.readLen - ( subexons[k].end - subexons[k].start + 1 ) - 1 <= 30 ) | |
| 1681 continue ; | |
| 1682 // We need at least one of the flanking subexons to be adjacent to the center one. | |
| 1683 if ( subexons[ subexonIdx[j - 1] ].end + 1 < subexons[k].start && subexons[k].end + 1 < subexons[ subexonIdx[j + 1] ].start ) | |
| 1684 continue ; | |
| 1685 | |
| 1686 int l = 0 ; | |
| 1687 for ( l = 0 ; l < tcCnt ; ++l ) | |
| 1688 { | |
| 1689 if ( btable[i].Test(l) && tc[l].abundance > 0 ) | |
| 1690 { | |
| 1691 if ( !usedConstraints[ tc[l].i ] ) | |
| 1692 { | |
| 1693 struct _constraint &c = constraints.constraints[ tc[l].i ] ; | |
| 1694 if ( c.vector.Test( subexonIdx[j - 1] ) && c.vector.Test( subexonIdx[j] ) && | |
| 1695 c.vector.Test( subexonIdx[j + 1] ) ) | |
| 1696 break ; | |
| 1697 usedConstraints[ tc[l].i ] = true ; | |
| 1698 } | |
| 1699 | |
| 1700 if ( !usedConstraints[ tc[l].j ] ) | |
| 1701 { | |
| 1702 struct _constraint &c = constraints.constraints[ tc[l].j ] ; | |
| 1703 if ( c.vector.Test( subexonIdx[j - 1] ) && c.vector.Test( subexonIdx[j] ) && | |
| 1704 c.vector.Test( subexonIdx[j + 1] ) ) | |
| 1705 break ; | |
| 1706 usedConstraints[ tc[l].j ] = true ; | |
| 1707 } | |
| 1708 } | |
| 1709 } | |
| 1710 // It is not covered | |
| 1711 if ( l >= tcCnt ) | |
| 1712 { | |
| 1713 int residual = alignments.readLen - ( subexons[k].end - subexons[k].start + 1 ) - 1 ; | |
| 1714 //printf( "residual: %d %d %lf\n", k, residual, value ) ; | |
| 1715 if ( value * residual > 2 ) | |
| 1716 { | |
| 1717 value = 1 / (double)residual ; | |
| 1718 } | |
| 1719 } | |
| 1720 }*/ | |
| 1721 | |
| 1722 if ( tag == -1 ) | |
| 1723 value = 0 ; | |
| 1724 if ( value > maxAbundance ) | |
| 1725 maxAbundance = value ; | |
| 1726 transcriptAbundance[i] = value ; | |
| 1727 if ( tag != -1 ) | |
| 1728 avgTranscriptAbundance[i] /= compatibleCnt ; | |
| 1729 | |
| 1730 //printf( "abundance %d: %lf %lf ", i, value, avgTranscriptAbundance[i] ) ; | |
| 1731 //alltranscripts[i].seVector.Print() ; | |
| 1732 } | |
| 1733 if ( maxAbundance == 0 ) | |
| 1734 { | |
| 1735 for ( i = 0 ; i < atcnt ; ++i ) | |
| 1736 { | |
| 1737 transcriptAbundance[i] = coveredPortion[i] ; | |
| 1738 } | |
| 1739 maxAbundance = 1 ; | |
| 1740 } | |
| 1741 //printf( "%s: %lf\n", __func__, maxAbundance ) ; | |
| 1742 int size = togetherChain.size() ; | |
| 1743 for ( j = 0 ; j < size ; ++j ) | |
| 1744 togetherChain[j].Release() ; | |
| 1745 delete[] usedConstraints ; | |
| 1746 delete[] covered ; | |
| 1747 delete[] chain ; | |
| 1748 } | |
| 1749 else | |
| 1750 { | |
| 1751 for ( i = 0 ; i < atcnt ; ++i ) | |
| 1752 { | |
| 1753 transcriptAbundance[i] = alltranscripts[i].abundance ; | |
| 1754 if ( transcriptAbundance[i] > maxAbundance ) | |
| 1755 maxAbundance = transcriptAbundance[i] ; | |
| 1756 coveredPortion[i] = 1 ; | |
| 1757 } | |
| 1758 if ( maxAbundance == 0 ) | |
| 1759 maxAbundance = 1 ; | |
| 1760 } | |
| 1761 | |
| 1762 // Obtain the prefix, suffix information of the transcripts. | |
| 1763 int *nextSuffix, *nextPrefix ; | |
| 1764 struct _pair32 *txptRank ; | |
| 1765 nextSuffix = new int[atcnt] ; | |
| 1766 nextPrefix = new int[atcnt] ; | |
| 1767 txptRank = new struct _pair32[atcnt] ; | |
| 1768 memset( nextSuffix, -1, sizeof( int ) * atcnt ) ; | |
| 1769 memset( nextPrefix, -1, sizeof( int ) * atcnt ) ; | |
| 1770 /*for ( i = 0 ; i < atcnt ; ++i ) | |
| 1771 { | |
| 1772 std::vector<int> subexonIdx ; | |
| 1773 txptRank[i].a = i ; | |
| 1774 alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ; | |
| 1775 txptRank[i].b = subexonIdx.size() ; | |
| 1776 } | |
| 1777 qsort( txptRank, atcnt, sizeof( struct _pair32 ), CompPairsByB) ; | |
| 1778 BitTable bufferTable( seCnt ) ; | |
| 1779 for ( i = atcnt - 1 ; i >= 0 ; --i ) | |
| 1780 { | |
| 1781 int a = txptRank[i].a ; | |
| 1782 for ( j = i - 1 ; j >= 0 ; --j ) | |
| 1783 { | |
| 1784 if ( txptRank[i].b == txptRank[j].b ) | |
| 1785 continue ; | |
| 1786 | |
| 1787 int b = txptRank[j].a ; | |
| 1788 | |
| 1789 if ( alltranscripts[b].last != alltranscripts[a].last ) | |
| 1790 continue ; | |
| 1791 | |
| 1792 bufferTable.Assign( alltranscripts[a].seVector ) ; | |
| 1793 bufferTable.MaskRegionOutside( alltranscripts[b].first, alltranscripts[b].last ) ; | |
| 1794 if ( bufferTable.IsEqual( alltranscripts[b].seVector ) ) | |
| 1795 { | |
| 1796 nextSuffix[a] = b ; | |
| 1797 break ; | |
| 1798 } | |
| 1799 } | |
| 1800 } | |
| 1801 for ( i = atcnt - 1 ; i >= 0 ; --i ) | |
| 1802 { | |
| 1803 int a = txptRank[i].a ; | |
| 1804 for ( j = i - 1 ; j >= 0 ; --j ) | |
| 1805 { | |
| 1806 if ( txptRank[i].b == txptRank[j].b ) | |
| 1807 continue ; | |
| 1808 | |
| 1809 int b = txptRank[j].a ; | |
| 1810 | |
| 1811 if ( alltranscripts[b].first != alltranscripts[a].first ) | |
| 1812 continue ; | |
| 1813 | |
| 1814 bufferTable.Assign( alltranscripts[a].seVector ) ; | |
| 1815 bufferTable.MaskRegionOutside( alltranscripts[b].first, alltranscripts[b].last ) ; | |
| 1816 if ( bufferTable.IsEqual( alltranscripts[b].seVector ) ) | |
| 1817 { | |
| 1818 nextPrefix[a] = b ; | |
| 1819 break ; | |
| 1820 } | |
| 1821 } | |
| 1822 } | |
| 1823 | |
| 1824 bufferTable.Release() ;*/ | |
| 1825 delete[] txptRank ; | |
| 1826 | |
| 1827 // Quantitative set cover | |
| 1828 int iterCnt = -1 ; | |
| 1829 double *coverCnt = new double[atcnt] ; | |
| 1830 for ( i = 0 ; i < atcnt ; ++i ) | |
| 1831 coverCnt[i] = -1 ; | |
| 1832 int *list = new int[atcnt] ; | |
| 1833 int listCnt ; | |
| 1834 | |
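| // Greedy quantitative set cover: repeatedly pick the transcript with the highest ComputeScore (combining covered constraint count, cross-sample weight and relative abundance), deplete the smallest-abundance constraint it covers, and credit the corresponding read support to that transcript, until no transcript covers an undepleted constraint. | |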
| 1835 while ( 1 ) | |
| 1836 { | |
| 1837 double max = -1 ; | |
| 1838 int maxtag = -1 ; | |
| 1839 double maxcnt = -1 ; | |
| 1840 ++iterCnt ; | |
| 1841 | |
| 1842 // Find the optimal candidate. | |
| 1843 for ( i = 0 ; i < atcnt ; ++i ) | |
| 1844 { | |
| 1845 double value = inf ; | |
| 1846 double cnt = 0 ; | |
| 1847 | |
| 1848 if ( coverCnt[i] == -1 ) | |
| 1849 { | |
| 1850 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 1851 { | |
| 1852 if ( tc[j].abundance > 0 && btable[i].Test( j ) ) | |
| 1853 { | |
| 1854 cnt += tc[j].effectiveCount ; | |
| 1855 } | |
| 1856 } | |
| 1857 /*else | |
| 1858 { | |
| 1859 std::vector<int> tcIdx ; | |
| 1860 btable[i].GetOnesIndices( tcIdx ) ; | |
| 1861 int size = tcIdx.size() ; | |
| 1862 for ( j = 0 ; j < size ; ++j ) | |
| 1863 { | |
| 1864 if ( tc[ tcIdx[j] ].abundance > 0 ) | |
| 1865 { | |
| 1866 cnt += tc[ tcIdx[j] ].effectiveCount ; | |
| 1867 } | |
| 1868 } | |
| 1869 }*/ | |
| 1870 coverCnt[i] = cnt ; | |
| 1871 } | |
| 1872 else | |
| 1873 { | |
| 1874 cnt = coverCnt[i] ; | |
| 1875 } | |
| 1876 | |
| 1877 value = transcriptAbundance[i] ; | |
| 1878 if ( cnt < 1 ) // This transcript does not satisfy any undepleted constraints. | |
| 1879 continue ; | |
| 1880 cnt *= coveredPortion[i] ; | |
| 1881 | |
| 1882 double weight = 1 ; //* seCnt / transcriptSeCnt[i] ; | |
| 1883 //if ( maxAbundance >= 1 && value / maxAbundance >= 0.2 ) | |
| 1884 // seCntAdjust = sqrt( (double)( transcriptSeCnt[i] ) / seCnt ) ;//< 0.5 ? 0.5 : (double)( transcriptSeCnt[i] ) / seCnt ; | |
| 1885 | |
| 1886 if ( alltranscripts[i].FPKM > 0 && sampleCnt > 1 ) | |
| 1887 weight = ( 1 + alltranscripts[i].FPKM / sampleCnt ) ; | |
| 1888 | |
| 1889 double score = ComputeScore( cnt, weight, value, maxAbundance, alltranscripts[i].correlationScore ) ; | |
| 1890 if ( cnt > maxcnt ) | |
| 1891 maxcnt = cnt ; | |
| 1892 score += adjustScore[i] ; | |
| 1893 if ( score > max ) | |
| 1894 { | |
| 1895 max = score ; | |
| 1896 maxtag = i ; | |
| 1897 } | |
| 1898 else if ( score == max ) | |
| 1899 { | |
| 1900 if ( avgTranscriptAbundance[maxtag] < avgTranscriptAbundance[i] ) | |
| 1901 { | |
| 1902 max = score ; | |
| 1903 maxtag = i ; | |
| 1904 } | |
| 1905 } | |
| 1906 //printf( "score: %d %lf -> %lf\n", i, cnt, score ) ; | |
| 1907 } | |
| 1908 | |
| 1909 if ( maxcnt == 0 || maxtag == -1 ) | |
| 1910 break ; | |
| 1911 | |
| 1912 // Find the constraint that should be depleted. | |
| 1913 double update = inf ; | |
| 1914 int updateTag = 0 ; | |
| 1915 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 1916 { | |
| 1917 if ( btable[ maxtag ].Test( j ) && tc[j].abundance > 0 && | |
| 1918 tc[j].abundance <= update ) | |
| 1919 { | |
| 1920 update = tc[j].abundance ; | |
| 1921 updateTag = j ; | |
| 1922 } | |
| 1923 } | |
| 1924 | |
| 1925 // Search the suffix and prefix transcripts to see whether they fit better. | |
| 1926 int p = nextSuffix[ maxtag] ; | |
| 1927 while ( p != -1 ) | |
| 1928 { | |
| 1929 if ( transcriptAbundance[p] >= 10.0 * transcriptAbundance[maxtag] | |
| 1930 && btable[p].Test( updateTag ) ) | |
| 1931 { | |
| 1932 //printf( "%d\n", p ) ; | |
| 1933 maxtag = p ; | |
| 1934 break ; | |
| 1935 } | |
| 1936 p = nextSuffix[p] ; | |
| 1937 } | |
| 1938 p = nextPrefix[maxtag] ; | |
| 1939 while ( p != -1 ) | |
| 1940 { | |
| 1941 if ( transcriptAbundance[p] >= 10.0 * transcriptAbundance[maxtag] | |
| 1942 && btable[p].Test( updateTag ) ) | |
| 1943 { | |
| 1944 maxtag = p ; | |
| 1945 break ; | |
| 1946 } | |
| 1947 p = nextPrefix[p] ; | |
| 1948 } | |
| 1949 | |
| 1950 | |
| 1951 // Update the abundance. | |
| 1952 int supportCnt = 0 ; | |
| 1953 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 1954 { | |
| 1955 if ( btable[maxtag].Test( j ) ) | |
| 1956 { | |
| 1957 if ( tc[j].abundance > 0 ) | |
| 1958 { | |
| 1959 tc[j].abundance -= 1 * update ; | |
| 1960 double factor = tc[j].effectiveCount ; | |
| 1961 double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) ; | |
| 1962 alltranscripts[maxtag].constraintsSupport[j] += tmp ; | |
| 1963 alltranscripts[maxtag].abundance += tmp ; | |
| 1964 | |
| 1965 if ( tc[j].abundance <= 0 ) | |
| 1966 { | |
| 1967 int l ; | |
| 1968 for ( l = 0 ; l < atcnt ; ++l ) | |
| 1969 { | |
| 1970 if ( btable[l].Test(j) ) | |
| 1971 coverCnt[l] -= tc[j].effectiveCount ; | |
| 1972 } | |
| 1973 } | |
| 1974 ++supportCnt ; | |
| 1975 } | |
| 1976 else if ( alltranscripts[maxtag].constraintsSupport[j] == 0 ) | |
| 1977 { | |
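| // This constraint is already depleted and maxtag has not received any of its support yet: shift a bounded share of the support currently held by other compatible transcripts to maxtag, proportionally to the relative abundances (capped by takeOutFactor). | |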
| 1978 double sum = 0 ; | |
| 1979 double takeOut = 0 ; | |
| 1980 double factor = tc[j].effectiveCount ; | |
| 1981 listCnt = 0 ; | |
| 1982 for ( i = 0 ; i < atcnt ; ++i ) | |
| 1983 { | |
| 1984 if ( i == maxtag ) | |
| 1985 continue ; | |
| 1986 | |
| 1987 if ( alltranscripts[i].abundance > 0 && btable[i].Test(j) ) | |
| 1988 { | |
| 1989 sum += alltranscripts[i].constraintsSupport[j] ; | |
| 1990 | |
| 1991 double tmp = ( alltranscripts[i].constraintsSupport[j] + alltranscripts[maxtag].constraintsSupport[j] ) * | |
| 1992 transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[i] ) | |
| 1993 - alltranscripts[maxtag].constraintsSupport[j] ; | |
| 1994 if ( tmp > 0 ) | |
| 1995 { | |
| 1996 list[ listCnt ] = i ; | |
| 1997 ++listCnt ; | |
| 1998 takeOut += tmp ; //alltranscripts[i].constraintsSupport[j] * transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[i] ) ; | |
| 1999 } | |
| 2000 } | |
| 2001 } | |
| 2002 | |
| 2003 double ratio = 1 ; | |
| 2004 double takeOutFactor = 0.5 ; | |
| 2005 if ( update < tc[j].normAbund ) | |
| 2006 { | |
| 2007 if ( takeOut > ( tc[j].support * update / tc[j].normAbund * factor ) * takeOutFactor ) | |
| 2008 ratio = ( tc[j].support * update / tc[j].normAbund * factor ) * takeOutFactor / takeOut ; | |
| 2009 } | |
| 2010 else | |
| 2011 { | |
| 2012 if ( takeOut > ( tc[j].support * factor ) * takeOutFactor ) | |
| 2013 ratio = ( tc[j].support * factor ) * takeOutFactor / takeOut ; | |
| 2014 } | |
| 2015 | |
| 2016 if ( 1 ) //update < tc[j].normAbund ) | |
| 2017 { | |
| 2018 for ( i = 0 ; i < listCnt ; ++i ) | |
| 2019 { | |
| 2020 //double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) * | |
| 2021 // ( alltranscripts[ list[i] ].constraintsSupport[j] / sum ) ; | |
| 2022 //if ( alltranscripts[ list[i] ].constraintsSupport[j] < tmp ) | |
| 2023 // printf( "WARNING! %lf %lf, %lf\n", alltranscripts[ list[i] ].constraintsSupport[j], sum, tmp ) ; | |
| 2024 | |
| 2025 //double tmp = alltranscripts[ list[i] ].constraintsSupport[j] * transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[ list[i] ] ) * ratio ; | |
| 2026 double tmp = ( ( alltranscripts[ list[i] ].constraintsSupport[j] + alltranscripts[maxtag].constraintsSupport[j] ) * | |
| 2027 transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[ list[i] ] ) | |
| 2028 - alltranscripts[maxtag].constraintsSupport[j] ) * ratio ; | |
| 2029 | |
| 2030 | |
| 2031 alltranscripts[ list[i] ].constraintsSupport[j] -= tmp ; | |
| 2032 alltranscripts[ list[i] ].abundance -= tmp ; | |
| 2033 } | |
| 2034 //double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) ; | |
| 2035 //printf( "%lf %lf. %lf %lf\n", takeOut, ratio, update, tc[j].normAbund ) ; | |
| 2036 double tmp = takeOut * ratio ; | |
| 2037 alltranscripts[maxtag].constraintsSupport[j] += tmp ; | |
| 2038 alltranscripts[maxtag].abundance += tmp ; | |
| 2039 } | |
| 2040 /*else | |
| 2041 { | |
| 2042 double tmp = ( tc[j].support / (double)( listCnt + 1 ) ) * factor ; | |
| 2043 for ( i = 0 ; i < listCnt ; ++i ) | |
| 2044 { | |
| 2045 alltranscripts[ list[i] ].abundance -= alltranscripts[ list[i] ].constraintsSupport[j] ; | |
| 2046 | |
| 2047 alltranscripts[ list[i] ].constraintsSupport[j] = tmp ; | |
| 2048 alltranscripts[ list[i] ].abundance += tmp ; | |
| 2049 } | |
| 2050 alltranscripts[maxtag].constraintsSupport[j] += tmp ; | |
| 2051 alltranscripts[maxtag].abundance += tmp ; | |
| 2052 }*/ | |
| 2053 | |
| 2054 } | |
| 2055 } | |
| 2056 | |
| 2057 if ( tc[j].abundance < 0 ) | |
| 2058 { | |
| 2059 tc[j].abundance = 0 ; | |
| 2060 | |
| 2061 } | |
| 2062 } | |
| 2063 tc[ updateTag ].abundance = 0 ; | |
| 2064 if ( supportCnt == 0 ) | |
| 2065 break ; | |
| 2066 //adjustScore[maxtag] += 1 / (double)tcCnt ; | |
| 2067 //printf( "maxtag=%d %lf %d\n", maxtag, update, updateTag ) ; | |
| 2068 } | |
| 2069 | |
| 2070 for ( i = 0 ; i < atcnt ; ++i ) | |
| 2071 { | |
| 2072 if ( alltranscripts[i].abundance > 0 ) | |
| 2073 { | |
| 2074 struct _transcript nt = alltranscripts[i] ; | |
| 2075 nt.seVector.Nullify() ; | |
| 2076 nt.seVector.Duplicate( alltranscripts[i].seVector ) ; | |
| 2077 nt.constraintsSupport = NULL ; | |
| 2078 if ( transcriptAbundance[i] == 0 ) | |
| 2079 nt.correlationScore = -1 ; | |
| 2080 else | |
| 2081 nt.correlationScore = 0 ; | |
| 2082 nt.id = i ; | |
| 2083 transcripts.push_back( nt ) ; | |
| 2084 } | |
| 2085 } | |
| 2086 | |
| 2087 // Release the memory of btable. | |
| 2088 for ( i = 0 ; i < atcnt ; ++i ) | |
| 2089 { | |
| 2090 delete[] alltranscripts[i].constraintsSupport ; | |
| 2091 btable[i].Release() ; | |
| 2092 } | |
| 2093 delete[] btable ; | |
| 2094 | |
| 2095 delete[] list ; | |
| 2096 delete[] transcriptSeCnt ; | |
| 2097 delete[] transcriptLength ; | |
| 2098 delete[] transcriptAbundance ; | |
| 2099 delete[] avgTranscriptAbundance ; | |
| 2100 delete[] coveredPortion ; | |
| 2101 delete[] adjustScore ; | |
| 2102 delete[] coverCnt ; | |
| 2103 | |
| 2104 delete[] nextPrefix ; | |
| 2105 delete[] nextSuffix ; | |
| 2106 | |
| 2107 // Redistribute weight if there are some constraints that are unbalanced. | |
| 2108 /*tcnt = transcripts.size() ; | |
| 2109 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2110 { | |
| 2111 int maxRatio = -1 ; | |
| 2112 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 2113 if ( transcripts[i].constraintsSupport[j] > 0 ) | |
| 2114 { | |
| 2115 double factor = tc[j].effectiveCount ; | |
| 2116 if ( transcripts[]) | |
| 2117 } | |
| 2118 }*/ | |
| 2119 } | |
| 2120 | |
| 2121 void TranscriptDecider::AbundanceEstimation( struct _subexon *subexons, int seCnt, Constraints &constraints, std::vector<struct _transcript> &transcripts ) | |
| 2122 { | |
| 2123 int tcnt = transcripts.size() ; | |
| 2124 int size ; | |
| 2125 int i, j ; | |
| 2126 if ( tcnt <= 0 ) | |
| 2127 return ; | |
| 2128 | |
| 2129 std::vector<struct _matePairConstraint> &tc = constraints.matePairs ; | |
| 2130 int tcCnt = tc.size() ; // transcript constraints | |
| 2131 | |
| 2132 BitTable *btable = new BitTable[ tcnt ] ; | |
| 2133 int *transcriptLength = new int[tcnt] ; | |
| 2134 int *compatibleList = new int[tcnt] ; | |
| 2135 double *rho = new double[tcnt] ; // the abundance. | |
| 2136 int iterCnt = 0 ; | |
| 2137 | |
| 2138 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2139 transcripts[i].constraintsSupport = new double[ tcCnt ] ; | |
| 2140 | |
| 2141 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2142 { | |
| 2143 btable[i].Init( tcCnt ) ; | |
| 2144 double min = -1 ; | |
| 2145 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 2146 { | |
| 2147 int a = tc[j].i ; | |
| 2148 int b = tc[j].j ; | |
| 2149 | |
| 2150 if ( IsConstraintInTranscript( transcripts[i], constraints.constraints[a] ) == 1 | |
| 2151 && IsConstraintInTranscript( transcripts[i], constraints.constraints[b] ) == 1 ) | |
| 2152 { | |
| 2153 //printf( "set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ; | |
| 2154 btable[i].Set( j ) ; | |
| 2155 | |
| 2156 if ( min == -1 || tc[j].normAbund < min ) | |
| 2157 min = tc[j].normAbund ; | |
| 2158 } | |
| 2159 } | |
| 2160 | |
| 2161 std::vector<int> subexonIdx ; | |
| 2162 transcripts[i].seVector.GetOnesIndices( subexonIdx ) ; | |
| 2163 int subexonIdxCnt = subexonIdx.size() ; | |
| 2164 int len = 0 ; | |
| 2165 for ( j = 0 ; j < subexonIdxCnt ; ++j ) | |
| 2166 len += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ; | |
| 2167 transcriptLength[i] = len - alignments.fragLen + 2 * alignments.fragStdev ; | |
| 2168 if ( transcriptLength[i] < 1 ) | |
| 2169 transcriptLength[i] = 1 ; | |
| 2170 rho[i] = transcripts[i].abundance / transcriptLength[i] ; // use the rough estimation generated before. | |
| 2171 if ( transcripts[i].correlationScore == -1 && rho[i] > 0.1 / (double)alignments.readLen ) | |
| 2172 rho[i] = 0.1 / (double)alignments.readLen ; | |
| 2173 } | |
| 2174 | |
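| // EM-style refinement: each round distributes every mate-pair constraint's support among its compatible transcripts in proportion to their current rho, then recomputes rho as the assigned support divided by the effective transcript length; iteration stops when the total change in rho falls below 1e-3 or after 1000 rounds. | |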
| 2175 while ( 1 ) | |
| 2176 { | |
| 2177 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2178 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 2179 { | |
| 2180 transcripts[i].constraintsSupport[j] = 0 ; | |
| 2181 } | |
| 2182 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 2183 { | |
| 2184 int clCnt = 0 ; | |
| 2185 double sum = 0 ; | |
| 2186 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2187 { | |
| 2188 if ( btable[i].Test(j) ) | |
| 2189 { | |
| 2190 compatibleList[ clCnt ] = i ; | |
| 2191 ++clCnt ; | |
| 2192 sum += rho[i] ; | |
| 2193 } | |
| 2194 } | |
| 2195 | |
| 2196 for ( i = 0 ; i < clCnt ; ++i ) | |
| 2197 { | |
| 2198 double factor = tc[j].effectiveCount ; | |
| 2199 transcripts[ compatibleList[i] ].constraintsSupport[j] = ( rho[ compatibleList[i] ] / sum ) * tc[j].support * factor ; | |
| 2200 } | |
| 2201 } | |
| 2202 | |
| 2203 double diff = 0 ; | |
| 2204 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2205 { | |
| 2206 double newAbund = 0 ; | |
| 2207 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 2208 newAbund += transcripts[i].constraintsSupport[j] ; | |
| 2209 double old = rho[i] ; | |
| 2210 rho[i] = newAbund / transcriptLength[i] ; | |
| 2211 //printf( "rho[%d]=%lf\n", i, rho[i] ) ; | |
| 2212 if ( transcripts[i].correlationScore == -1 && rho[i] > 0.1 / (double)alignments.readLen ) | |
| 2213 rho[i] = 0.1 / (double)alignments.readLen ; | |
| 2214 | |
| 2215 double tmp = ( old - rho[i] ) ; | |
| 2216 diff += tmp < 0 ? -tmp : tmp ; | |
| 2217 } | |
| 2218 //printf( "%lf\n", diff ) ; | |
| 2219 if ( diff < 1e-3) | |
| 2220 break ; | |
| 2221 | |
| 2222 ++iterCnt ; | |
| 2223 if ( iterCnt >= 1000 ) | |
| 2224 break ; | |
| 2225 } | |
| 2226 | |
| 2227 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2228 { | |
| 2229 //printf( "%lf=>", transcripts[i].abundance ) ; | |
| 2230 transcripts[i].abundance = 0 ; | |
| 2231 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 2232 { | |
| 2233 transcripts[i].abundance += transcripts[i].constraintsSupport[j] ; | |
| 2234 } | |
| 2235 //printf( "%lf. (%lf)\n", transcripts[i].abundance, transcripts[i].correlationScore ) ; | |
| 2236 //transcripts[i].seVector.Print() ; | |
| 2237 } | |
| 2238 | |
| 2239 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2240 delete[] transcripts[i].constraintsSupport ; | |
| 2241 | |
| 2242 // Release the memory of btable. | |
| 2243 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2244 { | |
| 2245 btable[i].Release() ; | |
| 2246 } | |
| 2247 delete[] compatibleList ; | |
| 2248 delete[] btable ; | |
| 2249 delete[] transcriptLength ; | |
| 2250 delete[] rho ; | |
| 2251 } | |
| 2252 | |
| 2253 int TranscriptDecider::RefineTranscripts( struct _subexon *subexons, int seCnt, bool aggressive, | |
| 2254 std::map<int, int> *subexonChainSupport, int *txptSampleSupport, std::vector<struct _transcript> &transcripts, Constraints &constraints ) | |
| 2255 { | |
| 2256 int i, j, k ; | |
| 2257 int tcnt = transcripts.size() ; | |
| 2258 if ( tcnt == 0 ) | |
| 2259 return 0 ; | |
| 2260 int tcCnt = constraints.matePairs.size() ; | |
| 2261 | |
| 2262 std::vector<struct _matePairConstraint> &tc = constraints.matePairs ; | |
| 2263 std::vector<struct _constraint> &scc = constraints.constraints ; // single-end constraints | |
| 2264 | |
| 2265 // Remove transcripts whose FPKM are too small. | |
| 2266 //printf( "%d %d\n", usedGeneId, baseGeneId ) ; | |
| 2267 double *geneMaxFPKM = new double[usedGeneId - baseGeneId ] ; | |
| 2268 int *geneMaxFPKMTag = new int[usedGeneId - baseGeneId ] ; | |
| 2269 double *nonOverlapMaxFPKM = new double[ usedGeneId - baseGeneId ] ; // the max FPKM among all transcripts not overlapping with the maxFPKMTag transcript. | |
| 2270 memset( geneMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ; | |
| 2271 memset( geneMaxFPKMTag, 0, sizeof( int ) * ( usedGeneId - baseGeneId ) ) ; | |
| 2272 memset( nonOverlapMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ; | |
| 2273 | |
| 2274 double *geneMaxCov = new double[ usedGeneId - baseGeneId ] ; | |
| 2275 memset( geneMaxCov, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ; | |
| 2276 int *txptGid = new int[tcnt] ; | |
| 2277 | |
| 2278 /*for ( i = 0 ; i < tcnt ; ++i ) | |
| 2279 { | |
| 2280 printf( "%d: %lf ", i, transcripts[i].FPKM ) ; | |
| 2281 transcripts[i].seVector.Print() ; | |
| 2282 }*/ | |
| 2283 | |
| 2284 /*================================================================== | |
| 2285 Remove transcripts whose relative FPKM is too low. (-f) | |
| 2286 ====================================================================*/ | |
| 2287 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2288 { | |
| 2289 int gid = GetTranscriptGeneId( transcripts[i], subexons ) ; | |
| 2290 int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ; | |
| 2291 //printf( "gid=%d\n", gid ) ; | |
| 2292 //printf( "%lf %lf %d\n", transcripts[i].abundance, transcripts[i].FPKM, len ) ; | |
| 2293 if ( transcripts[i].FPKM > geneMaxFPKM[gid - baseGeneId ] ) | |
| 2294 { | |
| 2295 geneMaxFPKM[ gid - baseGeneId ] = transcripts[i].FPKM ; | |
| 2296 geneMaxFPKMTag[ gid - baseGeneId ] = i ; | |
| 2297 } | |
| 2298 if ( transcripts[i].abundance * alignments.readLen / len > geneMaxCov[gid - baseGeneId ] ) | |
| 2299 geneMaxCov[gid - baseGeneId] = ( transcripts[i].abundance * alignments.readLen ) / len ; | |
| 2300 txptGid[i] = gid ; | |
| 2301 } | |
| 2302 | |
| 2303 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2304 { | |
| 2305 int tag = txptGid[i] - baseGeneId ; | |
| 2306 if ( ( transcripts[i].last < transcripts[ geneMaxFPKMTag[ tag ] ].first | |
| 2307 || transcripts[i].first > transcripts[ geneMaxFPKMTag[tag] ].last ) && transcripts[i].FPKM > nonOverlapMaxFPKM[tag] ) | |
| 2308 nonOverlapMaxFPKM[tag] = transcripts[i].FPKM ; | |
| 2309 } | |
| 2310 BitTable bufferTable ; | |
| 2311 bufferTable.Duplicate( transcripts[0].seVector ) ; | |
| 2312 | |
| 2313 if ( !aggressive ) | |
| 2314 { | |
| 2315 // Rescue the transcripts covering unique constraints. | |
| 2316 int cnt = 0 ; | |
| 2317 int tag = 0 ; | |
| 2318 int *uniqCount = new int[tcnt] ; | |
| 2319 memset( uniqCount, 0, sizeof( int ) * tcnt ) ; | |
| 2320 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 2321 { | |
| 2322 cnt = 0 ; | |
| 2323 if ( tc[j].uniqSupport <= 5 ) | |
| 2324 continue ; | |
| 2325 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2326 { | |
| 2327 if ( IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) && | |
| 2328 IsConstraintInTranscript( transcripts[i], scc[ tc[j].j] ) ) | |
| 2329 { | |
| 2330 tag = i ; | |
| 2331 ++cnt ; | |
| 2332 } | |
| 2333 if ( cnt >= 2 ) | |
| 2334 break ; | |
| 2335 } | |
| 2336 if ( cnt == 1 ) | |
| 2337 { | |
| 2338 ++uniqCount[tag] ; | |
| 2339 } | |
| 2340 } | |
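| // Transcripts that are the only carrier of at least two strongly unique constraints (uniqSupport > 5) get their abundance and FPKM boosted 4x, so they are less likely to be removed by the relative-FPKM filter below. | |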
| 2341 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2342 { | |
| 2343 if ( uniqCount[i] >= 2 ) | |
| 2344 { | |
| 2345 transcripts[i].abundance *= 4 ; | |
| 2346 transcripts[i].FPKM *= 4 ; | |
| 2347 } | |
| 2348 } | |
| 2349 | |
| 2350 delete[] uniqCount ; | |
| 2351 } | |
| 2352 | |
| 2353 int sccCnt = scc.size() ; | |
| 2354 double filterFactor = 1.0 ; | |
| 2355 | |
| 2356 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2357 { | |
| 2358 //printf( "%d: %lf %lf\n", txptGid[i], transcripts[i].abundance, geneMaxFPKM[ txptGid[i] - baseGeneId ] ) ; | |
| 2359 | |
| 2360 if ( transcripts[i].FPKM < filterFactor * FPKMFraction * geneMaxFPKM[ txptGid[i] - baseGeneId ] ) | |
| 2361 { | |
| 2362 /*int cnt = 0 ; | |
| 2363 int coverCnt = 0 ; | |
| 2364 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 2365 { | |
| 2366 if ( transcripts[i].constraintsSupport[j] > 0 ) | |
| 2367 ++coverCnt ; | |
| 2368 double factor = tc[j].effectiveCount ; | |
| 2369 if ( transcripts[i].constraintsSupport[j] >= factor * tc[j].support - 1e-3 | |
| 2370 && tc[j].support >= 10 | |
| 2371 && tc[j].uniqSupport >= 0.95 * tc[j].support ) | |
| 2372 { | |
| 2373 ++cnt ; | |
| 2374 } | |
| 2375 } | |
| 2376 //cnt = 0 ; | |
| 2377 if ( cnt >= 2 ) | |
| 2378 { | |
| 2379 ; | |
| 2380 } | |
| 2381 else*/ | |
| 2382 transcripts[i].abundance = -transcripts[i].abundance ; | |
| 2383 } | |
| 2384 //if ( transcripts[i].FPKM >= 0.8 * geneMaxFPKM[ txptGid[i] - baseGeneId ] && geneMaxCov[ txptGid[i] - baseGeneId ] >= txptMinReadDepth ) | |
| 2385 // continue ; | |
| 2386 } | |
| 2387 | |
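| // Rescue filtered transcripts that do not overlap any kept transcript of the same gene, provided their FPKM passes the FPKMFraction threshold against the non-overlapping maximum. | |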
| 2388 if ( nonOverlapMaxFPKM != 0 ) | |
| 2389 { | |
| 2390 // Do two passes to rescue; the first pass is just for marking. | |
| 2391 std::vector<int> rescueList ; | |
| 2392 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2393 { | |
| 2394 if ( transcripts[i].abundance >= 0 ) | |
| 2395 continue ; | |
| 2396 | |
| 2397 for ( j = 0 ; j < tcnt ; ++j ) | |
| 2398 { | |
| 2399 if ( transcripts[j].abundance < 0 || txptGid[i] != txptGid[j] ) | |
| 2400 continue ; | |
| 2401 if ( transcripts[i].first <= transcripts[j].last && transcripts[i].last >= transcripts[j].first ) | |
| 2402 /*bufferTable.Assign( transcripts[i].seVector ) ; | |
| 2403 bufferTable.And( transcripts[j].seVector ) ; | |
| 2404 | |
| 2405 if ( !bufferTable.IsAllZero() )*/ | |
| 2406 break ; | |
| 2407 } | |
| 2408 if ( j >= tcnt && transcripts[i].FPKM >= FPKMFraction * nonOverlapMaxFPKM[ txptGid[i] - baseGeneId ] ) | |
| 2409 { | |
| 2410 //transcripts[i].abundance = -transcripts[i].abundance ; | |
| 2411 rescueList.push_back( i ) ; | |
| 2412 } | |
| 2413 } | |
| 2414 | |
| 2415 int size = rescueList.size() ; | |
| 2416 for ( i = 0 ; i < size ; ++i ) | |
| 2417 transcripts[ rescueList[i] ].abundance *= -1 ; | |
| 2418 } | |
| 2419 | |
| 2420 /*================================================================== | |
| 2421 Remove transcripts that have too low read coverage (-d) | |
| 2422 ====================================================================*/ | |
| 2423 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2424 { | |
| 2425 if ( transcripts[i].abundance >= 0 ) | |
| 2426 { | |
| 2427 int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ; | |
| 2428 double cov = ( transcripts[i].abundance * alignments.readLen ) / len ; | |
| 2429 //printf( "%d: %d %d %lf %lf\n", i, len, transcripts[i].seVector.Count(), cov, geneMaxCov[ txptGid[i] - baseGeneId ] ) ; | |
| 2430 | |
| 2431 if ( ( tcnt > 1 || len <= 1000 || transcripts[i].seVector.Count() <= 3 ) && cov < txptMinReadDepth ) | |
| 2432 { | |
| 2433 //if ( usedGeneId == baseGeneId + 1 && /*transcripts[i].seVector.Count() > 3 | |
| 2434 // && len > 1000 &&*/ geneMaxCov[ txptGid[i] - baseGeneId ] == cov ) | |
| 2435 if ( geneMaxCov[ txptGid[i] - baseGeneId ] == cov ) | |
| 2436 continue ; | |
| 2437 | |
| 2438 // Test whether it has some very abundant constraints. | |
| 2439 /*int cnt = 0 ; | |
| 2440 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 2441 { | |
| 2442 if ( transcripts[i].constraintsSupport[j] >= tc[j].support / 2.0 | |
| 2443 && tc[j].support >= 10 | |
| 2444 && tc[j].uniqSupport >= 0.95 * tc[j].support | |
| 2445 && tc[j].normAbund >= 1 ) | |
| 2446 { | |
| 2447 ++cnt ; | |
| 2448 } | |
| 2449 } | |
| 2450 | |
| 2451 if ( cnt >= 1 ) | |
| 2452 { | |
| 2453 continue ; | |
| 2454 }*/ | |
| 2455 | |
| 2456 // Test whether this transcript is fully covered. If so, we can filter it. | |
| 2457 | |
| 2458 if ( geneMaxCov[ txptGid[i] - baseGeneId ] <= 5 ) | |
| 2459 { | |
| 2460 bufferTable.Reset() ; | |
| 2461 for ( j = 0 ; j < sccCnt ; ++j ) | |
| 2462 { | |
| 2463 if ( !IsConstraintInTranscript( transcripts[i], scc[j] ) ) | |
| 2464 continue ; | |
| 2465 bufferTable.Or( scc[j].vector ) ; | |
| 2466 } | |
| 2467 if ( bufferTable.IsEqual( transcripts[i].seVector ) ) | |
| 2468 transcripts[i].abundance = -transcripts[i].abundance ; | |
| 2469 } | |
| 2470 else | |
| 2471 transcripts[i].abundance = -transcripts[i].abundance ; | |
| 2472 | |
| 2473 /*else | |
| 2474 { | |
| 2475 transcripts[i].seVector.Print() ; | |
| 2476 bufferTable.Print() ; | |
| 2477 OutputTranscript( stderr, subexons, transcripts[i] ) ; | |
| 2478 }*/ | |
| 2479 } | |
| 2480 } | |
| 2481 } | |
| 2482 | |
| 2483 /*================================================================== | |
| 2484 Remove transcripts that are too short | |
| 2485 ====================================================================*/ | |
| 2486 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2487 { | |
| 2488 if ( transcripts[i].abundance <= 0 ) | |
| 2489 continue ; | |
| 2490 | |
| 2491 int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ; | |
| 2492 if ( len < 200 ) | |
| 2493 { | |
| 2494 transcripts[i].abundance = -transcripts[i].abundance ; | |
| 2495 } | |
| 2496 } | |
| 2497 | |
| 2498 // Rescue transcripts that showed up in many samples. | |
| 2499 /*for ( i = 0 ; i < tcnt ; ++i ) | |
| 2500 { | |
| 2501 if ( transcripts[i].abundance > 0 ) | |
| 2502 continue ; | |
| 2503 if ( txptSampleSupport[ transcripts[i].id ] >= 3 && | |
| 2504 txptSampleSupport[transcripts[i].id ] >= (int)( sampleCnt / 2 ) ) | |
| 2505 transcripts[i].abundance = -transcripts[i].abundance ; | |
| 2506 }*/ | |
| 2507 | |
| 2508 // Rescue some transcripts covering subexon chains that showed up in many samples but went missing after filtration. | |
| 2509 struct _constraint tmpC ; | |
| 2510 tmpC.vector.Init( seCnt ) ; | |
| 2511 | |
| 2512 std::vector< struct _pair32 > missingChain ; | |
| 2513 std::vector<int> recoverCandidate ; | |
| 2514 bool *used = new bool[tcnt] ; | |
| 2515 memset( used, false, sizeof( bool ) * tcnt ) ; | |
| 2516 | |
| 2517 // Obtain the list of transcripts that should be recovered. | |
| 2518 for ( i = 0 ; i < seCnt && sampleCnt > 1 ; ++i ) | |
| 2519 { | |
| 2520 | |
| 2521 double maxFPKM = -1 ; | |
| 2522 for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ; | |
| 2523 it != subexonChainSupport[i].end() ; ++it ) | |
| 2524 { | |
| 2525 if ( sampleCnt >= 0 && ( it->second < 3 || it->second < (int)( 0.5 * sampleCnt ) ) && it->second <= sampleCnt / 2 ) | |
| 2526 continue ; | |
| 2527 | |
| 2528 bool recover = true ; | |
| 2529 tmpC.vector.Reset() ; | |
| 2530 tmpC.vector.Set( i ) ; | |
| 2531 tmpC.vector.Set( it->first ) ; | |
| 2532 tmpC.first = i ; | |
| 2533 tmpC.last = it->first ; | |
| 2534 | |
| 2535 | |
| 2536 for ( j = 0 ; j < tcnt ; ++j ) | |
| 2537 { | |
| 2538 if ( transcripts[j].abundance < 0 ) | |
| 2539 continue ; | |
| 2540 | |
| 2541 if ( IsConstraintInTranscript( transcripts[j], tmpC ) ) | |
| 2542 { | |
| 2543 recover = false ; | |
| 2544 break ; | |
| 2545 } | |
| 2546 | |
| 2547 if ( recover ) | |
| 2548 { | |
| 2549 for ( j = 0 ; j < tcnt ; ++j ) | |
| 2550 { | |
| 2551 if ( transcripts[j].abundance > 0 ) | |
| 2552 continue ; | |
| 2553 //printf( "%d %lf\n", IsConstraintInTranscript( transcripts[j], tmpC ), transcripts[j].FPKM ) ; | |
| 2554 if ( IsConstraintInTranscript( transcripts[j], tmpC ) ) | |
| 2555 { | |
| 2556 /*if ( maxTag == -1 ) | |
| 2557 maxTag = j ; | |
| 2558 else | |
| 2559 { | |
| 2560 if ( txptSampleSupport[ transcripts[j].id ] > txptSampleSupport[ transcripts[maxTag ].id ] ) | |
| 2561 maxTag = j ; | |
| 2562 else if ( txptSampleSupport[ transcripts[j].id ] == txptSampleSupport[ transcripts[maxTag ].id ]) | |
| 2563 { | |
| 2564 if ( transcripts[j].FPKM > transcripts[maxTag].FPKM ) | |
| 2565 maxTag = j ; | |
| 2566 } | |
| 2567 }*/ | |
| 2568 | |
| 2569 struct _pair32 np ; | |
| 2570 np.a = i ; np.b = it->first ; | |
| 2571 missingChain.push_back( np ) ; | |
| 2572 | |
| 2573 if ( !used[j] ) | |
| 2574 { | |
| 2575 recoverCandidate.push_back( j ) ; | |
| 2576 used[j] = true ; | |
| 2577 } | |
| 2578 } | |
| 2579 } | |
| 2580 | |
| 2581 /*if ( maxTag != -1 && txptSampleSupport[ transcripts[maxTag].id ] > 1 ) | |
| 2582 { | |
| 2583 //printf( "recover %d %d\n", maxTag, txptSampleSupport[ transcripts[maxTag].id ] ) ; | |
| 2584 transcripts[maxTag].abundance *= -1 ; | |
| 2585 }*/ | |
| 2586 } | |
| 2587 } | |
| 2588 | |
| 2589 } | |
| 2590 } | |
| 2591 | |
| 2592 int size = recoverCandidate.size() ; | |
| 2593 memset( used, false, sizeof( bool ) * tcnt ) ; | |
| 2594 // Recover the candidates in the order of reliability | |
| 2595 int *geneRecoverCnt = new int[ usedGeneId - baseGeneId ] ; | |
| 2596 memset( geneRecoverCnt, 0, sizeof( int ) * ( usedGeneId - baseGeneId ) ) ; | |
| 2597 int round = 1 ; | |
| 2598 if ( aggressive && size > 1 ) | |
| 2599 round = 1 ; | |
| 2600 | |
| 2601 for ( i = 0 ; i < size ; ++i ) | |
| 2602 { | |
| 2603 int maxTag = -1 ; | |
| 2604 int maxCover = -1 ; | |
| 2605 for ( j = 0 ; j < size ; ++j ) | |
| 2606 { | |
| 2607 if ( !used[ recoverCandidate[j] ] ) | |
| 2608 { | |
| 2609 /*int cover = 0 ; | |
| 2610 | |
| 2611 k = missingChain.size() ; | |
| 2612 int l ; | |
| 2613 for ( l = 0 ; l < k ; ++l ) | |
| 2614 { | |
| 2615 if ( missingChain[l].a == -1 ) | |
| 2616 continue ; | |
| 2617 | |
| 2618 tmpC.vector.Reset() ; | |
| 2619 tmpC.vector.Set( missingChain[l].a ) ; | |
| 2620 tmpC.vector.Set( missingChain[l].b ) ; | |
| 2621 tmpC.first = missingChain[l].a ; | |
| 2622 tmpC.last = missingChain[l].b ; | |
| 2623 | |
| 2624 if ( IsConstraintInTranscript( transcripts[ recoverCandidate[j] ], tmpC ) ) | |
| 2625 { | |
| 2626 ++cover ; | |
| 2627 } | |
| 2628 }*/ | |
| 2629 | |
| 2630 if ( maxTag == -1 ) | |
| 2631 { | |
| 2632 maxTag = recoverCandidate[j] ; | |
| 2633 //maxCover = cover ; | |
| 2634 continue ; | |
| 2635 } | |
| 2636 | |
| 2637 /*if ( cover > maxCover ) | |
| 2638 { | |
| 2639 maxTag = recoverCandidate[j] ; | |
| 2640 maxCover = cover ; | |
| 2641 } | |
| 2642 else if ( cover == maxCover ) | |
| 2643 {*/ | |
| 2644 if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] > | |
| 2645 txptSampleSupport[ | |
| 2646 transcripts[ maxTag ].id | |
| 2647 ] ) | |
| 2648 maxTag = recoverCandidate[j] ; | |
| 2649 else if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] == | |
| 2650 txptSampleSupport[ transcripts[ maxTag ].id ] ) | |
| 2651 { | |
| 2652 if ( transcripts[ recoverCandidate[j] ].FPKM > transcripts[ maxTag ].FPKM ) | |
| 2653 maxTag = recoverCandidate[j] ; | |
| 2654 } | |
| 2655 | |
| 2656 /*else if ( transcripts[ recoverCandidate[j] ].FPKM > transcripts[ maxTag ].FPKM ) | |
| 2657 maxTag = recoverCandidate[j] ; | |
| 2658 else if ( transcripts[ recoverCandidate[j] ].FPKM == transcripts[ maxTag ].FPKM ) | |
| 2659 { | |
| 2660 if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] > | |
| 2661 txptSampleSupport[ transcripts[ maxTag ].id ] ) | |
| 2662 maxTag = recoverCandidate[j] ; | |
| 2663 }*/ | |
| 2664 //} | |
| 2665 } | |
| 2666 } | |
| 2667 | |
| 2668 if ( maxTag == -1 || txptSampleSupport[ transcripts[ maxTag ].id ] <= 2 | |
| 2669 || txptSampleSupport[ transcripts[maxTag].id ] < 0.5 * sampleCnt ) | |
| 2670 break ; | |
| 2671 | |
| 2672 used[maxTag] = true ; | |
| 2673 if ( geneRecoverCnt[ txptGid[maxTag] - baseGeneId ] >= round ) | |
| 2674 continue ; | |
| 2675 ++geneRecoverCnt[ txptGid[maxTag] - baseGeneId ] ; | |
| 2676 | |
| 2677 k = missingChain.size() ; | |
| 2678 int cnt = 0 ; | |
| 2679 for ( j = 0 ; j < k ; ++j ) | |
| 2680 { | |
| 2681 if ( missingChain[j].a == -1 ) | |
| 2682 continue ; | |
| 2683 | |
| 2684 tmpC.vector.Reset() ; | |
| 2685 tmpC.vector.Set( missingChain[j].a ) ; | |
| 2686 tmpC.vector.Set( missingChain[j].b ) ; | |
| 2687 tmpC.first = missingChain[j].a ; | |
| 2688 tmpC.last = missingChain[j].b ; | |
| 2689 | |
| 2690 if ( IsConstraintInTranscript( transcripts[maxTag], tmpC ) ) | |
| 2691 { | |
| 2692 missingChain[j].a = -1 ; | |
| 2693 ++cnt ; | |
| 2694 } | |
| 2695 } | |
| 2696 | |
| 2697 int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[maxTag].abundance, transcripts[maxTag].FPKM ) ; | |
| 2698 double cov = ( transcripts[maxTag].abundance * alignments.readLen ) / len ; | |
| 2699 if ( cnt >= 1 && cov > 1.0 ) | |
| 2700 { | |
| 2701 transcripts[maxTag].abundance *= -1 ; | |
| 2702 } | |
| 2703 } | |
| 2704 delete[] used ; | |
| 2705 delete[] geneRecoverCnt ; | |
| 2706 tmpC.vector.Release() ; | |
| 2707 | |
| 2708 | |
| 2709 tcnt = RemoveNegativeAbundTranscripts( transcripts ) ; | |
| 2710 | |
| 2711 | |
| 2712 | |
| 2713 delete []geneMaxCov ; | |
| 2714 bufferTable.Release() ; | |
| 2715 delete []geneMaxFPKM ; | |
| 2716 delete []geneMaxFPKMTag ; | |
| 2717 delete []nonOverlapMaxFPKM ; | |
| 2718 delete []txptGid ; | |
| 2719 | |
| 2720 /*================================================================== | |
| 2721 Remove transcripts that seem duplicated | |
| 2722 ====================================================================*/ | |
| 2723 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2724 { | |
| 2725 int support = 0 ; | |
| 2726 int uniqSupport = 0 ; | |
| 2727 | |
| 2728 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 2729 { | |
| 2730 if ( !IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) || !IsConstraintInTranscript( transcripts[i], scc[ tc[j].j ] ) ) | |
| 2731 continue ; | |
| 2732 //support += scc[ tc[j].i ].support + scc[ tc[j].j ].support ; | |
| 2733 //uniqSupport += scc[ tc[j].i ].uniqSupport + scc[ tc[j].j ].uniqSupport ; | |
| 2734 support += tc[j].support ; | |
| 2735 uniqSupport += tc[j].uniqSupport ; | |
| 2736 | |
| 2737 //printf( "constraint uniqness: %d: %d %d\n", i, tc[j].uniqSupport, tc[j].support ) ; | |
| 2738 } | |
| 2739 //printf( "%d: %d %d\n", i, uniqSupport, support ) ; | |
| 2740 if ( (double)uniqSupport < 0.03 * support ) | |
| 2741 transcripts[i].abundance = -1 ; | |
| 2742 } | |
| 2743 tcnt = RemoveNegativeAbundTranscripts( transcripts ) ; | |
| 2744 | |
| 2745 | |
| 2746 /*================================================================== | |
| 2747 Remove shadow transcripts: abnormal 2-exon transcripts whose intron is very close to the true one, or where one of the anchor exons is shorter than 25 bp. | |
| 2748 ====================================================================*/ | |
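| // A candidate is flagged for removal when the reads supporting its single intron are mostly non-unique or too few, when one of its anchor exons is at most 25 bp, or when the junction-flanking subexons on the main strand are both very short (<= 20 bp). | |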
| 2749 int minusCnt = 0, plusCnt = 0 ; | |
| 2750 int mainStrand ; | |
| 2751 for ( i = 0 ; i < seCnt ; ++i ) | |
| 2752 { | |
| 2753 if ( subexons[i].rightStrand == 1 ) | |
| 2754 ++plusCnt ; | |
| 2755 else if ( subexons[i].rightStrand == -1 ) | |
| 2756 ++minusCnt ; | |
| 2757 } | |
| 2758 if ( plusCnt > minusCnt ) | |
| 2759 mainStrand = 1 ; | |
| 2760 else | |
| 2761 mainStrand = -1 ; | |
| 2762 | |
| 2763 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2764 { | |
| 2765 std::vector<int> subexonIdx ; | |
| 2766 transcripts[i].seVector.GetOnesIndices( subexonIdx ) ; | |
| 2767 int size = subexonIdx.size() ; | |
| 2768 int intronCnt = 0 ; | |
| 2769 int anchorIdx = 0 ; // the subexon adjacent to the only intron. | |
| 2770 | |
| 2771 for ( j = 0 ; j < size - 1 ; ++j ) | |
| 2772 { | |
| 2773 if ( subexons[ subexonIdx[j] ].end + 1 < subexons[ subexonIdx[j + 1] ].start ) | |
| 2774 { | |
| 2775 ++intronCnt ; | |
| 2776 anchorIdx = j ; | |
| 2777 } | |
| 2778 } | |
| 2779 if ( intronCnt != 1 ) | |
| 2780 continue ; | |
| 2781 | |
| 2782 int anchorExonLength[2] = {0, 0}; | |
| 2783 int tag = 0 ; | |
| 2784 for ( j = 0 ; j < size ; ++j ) | |
| 2785 { | |
| 2786 anchorExonLength[tag] += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ; | |
| 2787 if ( tag == 0 && subexons[ subexonIdx[j] ].end + 1 < subexons[ subexonIdx[j + 1] ].start ) | |
| 2788 ++tag ; | |
| 2789 } | |
| 2790 | |
| 2791 int flag = 0 ; | |
| 2792 if ( subexons[ subexonIdx[anchorIdx] ].rightStrand == mainStrand ) | |
| 2793 { | |
| 2794 j = subexonIdx[ anchorIdx ] ; | |
| 2795 if ( subexons[j].end - subexons[j].start + 1 <= 20 || | |
| 2796 ( subexons[j+ 1].start == subexons[j].end + 1 && subexons[j + 1].end - subexons[j + 1].start + 1 <= 20 | |
| 2797 && subexons[j + 1].rightStrand == mainStrand ) ) | |
| 2798 ++flag ; | |
| 2799 j = subexonIdx[ anchorIdx + 1 ] ; | |
| 2800 if ( subexons[j].end - subexons[j].start + 1 <= 20 || | |
| 2801 ( subexons[j].start == subexons[j - 1].end + 1 && subexons[j - 1].end - subexons[j - 1].start + 1 <= 20 | |
| 2802 && subexons[j - 1].leftStrand == mainStrand ) ) | |
| 2803 ++flag ; | |
| 2804 } | |
| 2805 | |
| 2806 if ( anchorExonLength[0] <= 25 || anchorExonLength[1] <= 25 ) | |
| 2807 flag = 2 ; | |
| 2808 | |
| 2809 // The alignments supporting the intron must be mostly unique and have enough support. | |
| 2810 int support = 0 ; | |
| 2811 int uniqSupport = 0 ; | |
| 2812 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 2813 { | |
| 2814 if ( !IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) || !IsConstraintInTranscript( transcripts[i], scc[ tc[j].j ] ) ) | |
| 2815 continue ; | |
| 2816 if ( ( scc[ tc[j].i ].vector.Test( subexonIdx[ anchorIdx ] ) && scc[ tc[j].i ].vector.Test( subexonIdx[ anchorIdx + 1 ] ) ) | |
| 2817 || ( scc[ tc[j].j ].vector.Test( subexonIdx[ anchorIdx ] ) && scc[ tc[j].j ].vector.Test( subexonIdx[ anchorIdx + 1 ] ) ) ) | |
| 2818 { | |
| 2819 support += tc[j].support ; | |
| 2820 uniqSupport += tc[j].uniqSupport ; | |
| 2821 } | |
| 2822 | |
| 2823 } | |
| 2824 | |
| 2825 if ( (double)uniqSupport < 0.3 * support || support < txptMinReadDepth ) | |
| 2826 { | |
| 2827 flag = 2 ; | |
| 2828 } | |
| 2829 | |
| 2830 if ( flag == 2 ) | |
| 2831 transcripts[i].abundance = -1 ; | |
| 2832 | |
| 2833 } | |
| 2834 tcnt = RemoveNegativeAbundTranscripts( transcripts ) ; | |
| 2835 | |
| 2836 return transcripts.size() ; | |
| 2837 } | |
| 2838 | |
| 2839 void TranscriptDecider::ComputeTranscriptsScore( struct _subexon *subexons, int seCnt, std::map<int, int> *subexonChainSupport, std::vector<struct _transcript> &transcripts ) | |
| 2840 { | |
| 2841 int i, j ; | |
| 2842 int tcnt = transcripts.size() ; | |
| 2843 struct _constraint tmpC ; | |
| 2844 tmpC.vector.Init( seCnt ) ; | |
| 2845 | |
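| // A transcript's correlationScore counts how many subexon chains with support in enough samples the transcript contains. | |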
| 2846 for ( i = 0 ; i < tcnt ; ++i ) | |
| 2847 transcripts[i].correlationScore = 0 ; | |
| 2848 | |
| 2849 for ( i = 0 ; i < seCnt ; ++i ) | |
| 2850 { | |
| 2851 for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ; | |
| 2852 it != subexonChainSupport[i].end() ; ++it ) | |
| 2853 { | |
| 2854 if ( sampleCnt >= 0 && ( it->second < 3 || it->second < (int)( 0.1 * sampleCnt ) ) && it->second <= sampleCnt / 2 ) | |
| 2855 continue ; | |
| 2856 | |
| 2857 tmpC.vector.Reset() ; | |
| 2858 tmpC.vector.Set( i ) ; | |
| 2859 tmpC.vector.Set( it->first ) ; | |
| 2860 tmpC.first = i ; | |
| 2861 tmpC.last = it->first ; | |
| 2862 | |
| 2863 for ( j = 0 ; j < tcnt ; ++j ) | |
| 2864 { | |
| 2865 if ( IsConstraintInTranscript( transcripts[j], tmpC ) ) | |
| 2866 ++transcripts[j].correlationScore ; | |
| 2867 } | |
| 2868 } | |
| 2869 } | |
| 2870 | |
| 2871 tmpC.vector.Release() ; | |
| 2872 } | |
| 2873 | |
| 2874 int TranscriptDecider::Solve( struct _subexon *subexons, int seCnt, std::vector<Constraints> &constraints, SubexonCorrelation &subexonCorrelation ) | |
| 2875 { | |
| 2876 int i, j, k ; | |
| 2877 int cnt = 0 ; | |
| 2878 int *f = new int[seCnt] ; // a general-purpose per-subexon buffer (here it holds subtranscript counts). | |
| 2879 bool useDP = false ; | |
| 2880 | |
| 2881 compatibleTestVectorT.Init( seCnt ) ; // the bit table used in the compatibility test function. | |
| 2882 compatibleTestVectorC.Init( seCnt ) ; | |
| 2883 | |
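| // Mark which subexons may start or end a transcript: those with no incoming (prevCnt == 0) or no outgoing (nextCnt == 0) edge always qualify; soft boundaries qualify when their classifier score is below canBeSoftBoundaryThreshold, and duplicate soft ends within a contiguous region are removed. | |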
| 2884 for ( i = 0 ; i < seCnt ; ++i ) | |
| 2885 { | |
| 2886 subexons[i].canBeStart = subexons[i].canBeEnd = false ; | |
| 2887 | |
| 2888 if ( subexons[i].prevCnt == 0 ) | |
| 2889 subexons[i].canBeStart = true ; | |
| 2890 else if ( subexons[i].leftClassifier < canBeSoftBoundaryThreshold && subexons[i].leftClassifier != -1 | |
| 2891 && subexons[i].leftStrand != 0 ) // The case of overhang. | |
| 2892 { | |
| 2893 // We then check whether a left-side end has already showed up before this subexon in this region of subexons. | |
| 2894 bool flag = true ; | |
| 2895 for ( j = i - 1 ; j >= 0 ; --j ) | |
| 2896 { | |
| 2897 if ( subexons[j].end + 1 != subexons[j + 1].start ) | |
| 2898 break ; | |
| 2899 if ( subexons[j].canBeStart == true ) | |
| 2900 { | |
| 2901 flag = false ; | |
| 2902 break ; | |
| 2903 } | |
| 2904 } | |
| 2905 subexons[i].canBeStart = flag ; | |
| 2906 } | |
| 2907 | |
| 2908 if ( subexons[i].nextCnt == 0 ) | |
| 2909 subexons[i].canBeEnd = true ; | |
| 2910 else if ( subexons[i].rightClassifier < canBeSoftBoundaryThreshold && subexons[i].rightClassifier != -1 | |
| 2911 && subexons[i].rightStrand != 0 ) | |
| 2912 { | |
| 2913 subexons[i].canBeEnd = true ; | |
| 2914 } | |
| 2915 // Remove other soft ends that already showed up in this region of subexons. | |
| 2916 if ( subexons[i].canBeEnd == true ) | |
| 2917 { | |
| 2918 for ( j = i - 1 ; j >= 0 ; --j ) | |
| 2919 { | |
| 2920 if ( subexons[j].end + 1 != subexons[j + 1].start ) | |
| 2921 break ; | |
| 2922 if ( subexons[j].canBeEnd == true ) | |
| 2923 { | |
| 2924 subexons[j].canBeEnd = false ; | |
| 2925 break ; | |
| 2926 } | |
| 2927 } | |
| 2928 } | |
| 2929 //printf( "%d: %d %lf\n", subexons[i].canBeStart, subexons[i].prevCnt, subexons[i].leftClassifier ) ; | |
| 2930 } | |
| 2931 | |
| 2932 // Go through the cases of mixed-strand regions to set canBeStart/canBeEnd. | |
| 2933 // e.g: +[...]+_____+[....]-...]+____+[..)_____-[...]- | |
| 2934 // ^ then we need to force a start point here. | |
| 2935 // Do we need to associate strand information with canBeStart, canBeEnd? | |
| 2936 for ( i = 0 ; i < seCnt ; ) | |
| 2937 { | |
| 2938 // [i, j) is a region. | |
| 2939 for ( j = i + 1 ; j < seCnt ; ++j ) | |
| 2940 if ( subexons[j].start > subexons[j - 1].end + 1 ) | |
| 2941 break ; | |
| 2942 if ( subexons[i].canBeStart == false ) // then subexons[i] must have a hard left boundary. | |
| 2943 { | |
| 2944 int leftStrandCnt[2] = {0, 0} ; | |
| 2945 for ( k = i ; k < j ; ++k ) | |
| 2946 { | |
| 2947 if ( !SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) ) | |
| 2948 break ; | |
| 2949 if ( subexons[k].leftStrand != 0 ) | |
| 2950 ++leftStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] ; | |
| 2951 } | |
| 2952 if ( k < j && leftStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] == 0 ) | |
| 2953 subexons[i].canBeStart = true ; | |
| 2954 } | |
| 2955 | |
| 2956 if ( subexons[j - 1].canBeEnd == false ) | |
| 2957 { | |
| 2958 int rightStrandCnt[2] = {0, 0} ; | |
| 2959 for ( k = j - 1 ; k >= i ; --k ) | |
| 2960 { | |
| 2961 if ( !SubexonGraph::IsSameStrand( subexons[k].leftStrand, subexons[j - 1].rightStrand ) ) | |
| 2962 break ; | |
| 2963 if ( subexons[k].rightStrand != 0 ) | |
| 2964 ++rightStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] ; | |
| 2965 } | |
| 2966 if ( k >= i && rightStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] == 0 ) | |
| 2967 subexons[j - 1].canBeEnd = true ; | |
| 2968 } | |
| 2969 | |
| 2970 //if ( subexons[i].start == 6870264) | |
| 2971 // printf( "hi %d %d\n",i , subexons[i].canBeStart ) ; | |
| 2972 i = j ; | |
| 2973 } | |
| 2974 /*for ( i = 0 ; i < seCnt ; ++i ) | |
| 2975 { | |
| 2976 printf( "%d %d: %d %d\n", subexons[i].start, subexons[i].end, subexons[i].canBeStart, subexons[i].canBeEnd ) ; | |
| 2977 }*/ | |
| 2978 | |
| 2979 // Find the gene ids. | |
| 2980 baseGeneId = subexons[0].lcCnt ; | |
| 2981 usedGeneId = subexons[0].rcCnt ; | |
| 2982 defaultGeneId[0] = defaultGeneId[1] = -1 ; | |
| 2983 for ( i = 0 ; i < seCnt ; ++i ) | |
| 2984 { | |
| 2985 if ( subexons[i].geneId < 0 ) | |
| 2986 continue ; | |
| 2987 | |
| 2988 //if ( baseGeneId == -1 || subexons[i].geneId < baseGeneId ) | |
| 2989 // baseGeneId = subexons[i].geneId ; | |
| 2990 //if ( subexons[i].geneId > usedGeneId ) | |
| 2991 // usedGeneId = subexons[i].geneId ; | |
| 2992 | |
| 2993 if ( ( subexons[i].rightStrand == -1 || subexons[i].leftStrand == -1 ) && | |
| 2994 ( defaultGeneId[0] == -1 || subexons[i].geneId < defaultGeneId[0] ) ) | |
| 2995 defaultGeneId[0] = subexons[i].geneId ; | |
| 2996 if ( ( subexons[i].rightStrand == 1 || subexons[i].leftStrand == 1 ) && | |
| 2997 ( defaultGeneId[1] == -1 || subexons[i].geneId < defaultGeneId[1] ) ) | |
| 2998 defaultGeneId[1] = subexons[i].geneId ; | |
| 2999 } | |
| 3000 if ( defaultGeneId[0] == -1 ) | |
| 3001 defaultGeneId[0] = baseGeneId ; | |
| 3002 if ( defaultGeneId[1] == -1 ) | |
| 3003 defaultGeneId[1] = usedGeneId - 1 ; | |
| 3004 | |
| 3005 // Go through the constraints to find the chains of subexons that should be kept. | |
| 3006 std::map<int, int> *subexonChainSupport = new std::map<int, int>[ seCnt ] ; | |
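| // subexonChainSupport[a][b] will count how many samples have a reliable constraint (mostly unique alignments, support >= 3) linking the non-adjacent subexons a and b. | |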
| 3007 for ( i = 0 ; i < sampleCnt ; ++i ) | |
| 3008 { | |
| 3009 std::vector<int> subexonIdx ; | |
| 3010 std::vector<struct _pair32> chain ; | |
| 3011 | |
| 3012 int tcCnt = constraints[i].constraints.size() ; | |
| 3013 int size ; | |
| 3014 for ( j = 0 ; j < tcCnt ; ++j ) | |
| 3015 { | |
| 3016 struct _constraint c = constraints[i].constraints[j] ; | |
| 3017 if ( c.uniqSupport < 0.95 * c.support || c.support < 3 ) | |
| 3018 continue ; | |
| 3019 | |
| 3020 subexonIdx.clear() ; | |
| 3021 c.vector.GetOnesIndices( subexonIdx ) ; | |
| 3022 size = subexonIdx.size() ; | |
| 3023 | |
| 3024 for ( k = 0 ; k < size - 1 ; ++k ) | |
| 3025 { | |
| 3026 struct _pair32 p ; | |
| 3027 | |
| 3028 p.a = subexonIdx[k] ; | |
| 3029 p.b = subexonIdx[k + 1] ; | |
| 3030 //if ( subexons[p.a].end + 1 == 113235898 && subexons[ p.b ].start + 1 == 113236121 ) | |
| 3031 // printf( "bad bad %d %d %d\n", i, c.uniqSupport, c.support ) ; | |
| 3032 | |
| 3033 if ( subexons[ p.a ].end + 1 < subexons[ p.b ].start ) | |
| 3034 chain.push_back( p ) ; | |
| 3035 } | |
| 3036 } | |
| 3037 // Remove redundancy. | |
| 3038 sort( chain.begin(), chain.end(), CompSortPairs ) ; | |
| 3039 size = chain.size() ; | |
| 3040 k = 0 ; | |
| 3041 for ( j = 1 ; j < size ; ++j ) | |
| 3042 { | |
| 3043 if ( chain[j].a == chain[k].a && chain[j].b == chain[k].b ) | |
| 3044 continue ; | |
| 3045 else | |
| 3046 { | |
| 3047 ++k ; | |
| 3048 chain[k] = chain[j] ; | |
| 3049 } | |
| 3050 } | |
| 3051 chain.resize( k + 1 ) ; | |
| 3052 | |
| 3053 // Add those to the per-sample support counts. | |
| 3054 size = k + 1 ; | |
| 3055 for ( j = 0 ; j < size ; ++j ) | |
| 3056 { | |
| 3057 if ( subexonChainSupport[ chain[j].a ].count( chain[j].b ) ) | |
| 3058 { | |
| 3059 ++subexonChainSupport[ chain[j].a ][ chain[j].b ] ; | |
| 3060 } | |
| 3061 else | |
| 3062 subexonChainSupport[ chain[j].a ][ chain[j].b ] = 1 ; | |
| 3063 } | |
| 3064 } | |
| 3065 | |
| 3066 /*for ( i = 0 ; i < seCnt ; ++i ) | |
| 3067 { | |
| 3068 printf( "%d:", i ) ; | |
| 3069 for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ; it != subexonChainSupport[i].end() ; ++it ) | |
| 3070 printf( " (%d %d) ", it->first, it->second ) ; | |
| 3071 printf( "\n" ) ; | |
| 3072 }*/ | |
| 3073 | |
| 3074 //printf( "%d %d %d\n", defaultGeneId[0], baseGeneId, usedGeneId ) ; | |
| 3075 cnt = 0 ; | |
| 3076 memset( f, -1, sizeof( int ) * seCnt ) ; | |
| 3077 for ( i = 0 ; i < seCnt ; ++i ) | |
| 3078 { | |
| 3079 if ( subexons[i].canBeStart ) | |
| 3080 { | |
| 3081 cnt += SubTranscriptCount( i, subexons, f ) ; | |
| 3082 } | |
| 3083 } | |
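| // Decide whether exhaustive enumeration is feasible: fall back to dynamic programming if the total transcript count, or any single subexon's count in f[], exceeds USE_DP. | |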
| 3084 if ( cnt <= USE_DP ) | |
| 3085 { | |
| 3086 for ( i = 0 ; i < seCnt ; ++i ) | |
| 3087 if ( f[i] > USE_DP ) | |
| 3088 { | |
| 3089 useDP = true ; | |
| 3090 break ; | |
| 3091 } | |
| 3092 } | |
| 3093 else | |
| 3094 useDP = true ; | |
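| // Heuristic: even when the total count is below USE_DP, use DP for samples whose enumeration count is large relative to their constraint-to-mate-pair ratio. | |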
| 3095 if ( !useDP ) | |
| 3096 { | |
| 3097 for ( i = 0 ; i < sampleCnt ; ++i ) | |
| 3098 { | |
| 3099 double msize = constraints[i].matePairs.size() ; | |
| 3100 double csize = constraints[i].constraints.size() ; | |
| 3101 if ( cnt > ( csize / msize ) * ( csize / msize ) * seCnt | |
| 3102 && cnt > USE_DP / ( msize * msize ) && cnt > 50 ) | |
| 3103 { | |
| 3104 useDP = true ; | |
| 3105 break ; | |
| 3106 } | |
| 3107 } | |
| 3108 } | |
| 3109 | |
| 3110 int atCnt = cnt ; | |
| 3111 printf( "%d: atCnt=%d seCnt=%d %d %d %d\n", subexons[0].start + 1, atCnt, seCnt, useDP, (int)constraints[0].constraints.size(), (int)constraints[0].matePairs.size() ) ; | |
| 3112 fflush( stdout ) ; | |
| 3113 std::vector<struct _transcript> alltranscripts ; | |
| 3114 | |
| 3115 if ( !useDP ) | |
| 3116 { | |
| 3117 int origSize = atCnt ; | |
| 3118 alltranscripts.resize( atCnt ) ; | |
| 3119 for ( i = 0 ; i < atCnt ; ++i ) | |
| 3120 { | |
| 3121 alltranscripts[i].seVector.Init( seCnt ) ; | |
| 3122 alltranscripts[i].correlationScore = 1 ; | |
| 3123 } | |
| 3124 | |
| 3125 atCnt = 0 ; | |
| 3126 for ( i = 0 ; i < seCnt ; ++i ) | |
| 3127 { | |
| 3128 if ( subexons[i].canBeStart ) | |
| 3129 EnumerateTranscript( i, 0, f, 0, subexons, subexonCorrelation, 1, alltranscripts, atCnt ) ; | |
| 3130 } | |
| 3131 | |
| 3132 for ( i = atCnt ; i < origSize ; ++i ) | |
| 3133 alltranscripts[i].seVector.Release() ; | |
| 3134 | |
| 3135 alltranscripts.resize( atCnt ) ; | |
| 3136 //printf( "transcript cnt: %d\n", atCnt ) ; | |
| 3137 //printf( "%d %d\n", alltranscripts[0].seVector.Test( 1 ), constraints[0].matePairs.size() ) ; | |
| 3138 } | |
| 3139 else // Use dynamic programming to pick a set of candidate transcripts. | |
| 3140 { | |
| 3141 std::vector<struct _transcript> sampleTranscripts ; | |
| 3142 | |
| 3143 // Pre-allocate the memory. | |
| 3144 struct _dpAttribute attr ; | |
| 3145 attr.f1 = new struct _dp[seCnt] ; | |
| 3146 if ( seCnt <= 10000 ) | |
| 3147 { | |
| 3148 attr.f2 = new struct _dp*[seCnt] ; | |
| 3149 for ( i = 0 ; i < seCnt ; ++i ) | |
| 3150 attr.f2[i] = new struct _dp[seCnt] ; | |
| 3151 } | |
| 3152 else | |
| 3153 attr.f2 = NULL ; | |
| 3154 | |
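| // Size the DP memoization hash table by the number of subexons, testing the largest threshold first. | |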
| 3155 hashMax = HASH_MAX ; | |
| 3156 if (seCnt > 1500) | |
| 3157 hashMax = 20000003 ; | |
| 3158 else if (seCnt > 1000) | |
| 3159 hashMax = 10000019 ; | |
| 3160 else if (seCnt > 500) | |
| 3161 hashMax = 1000003 ; | |
| 3162 | |
| 3163 attr.hash = dpHash ; | |
| 3164 if ( hashMax != HASH_MAX ) | |
| 3165 attr.hash = new struct _dp[hashMax] ; | |
| 3166 | |
| 3167 for ( i = 0 ; i < seCnt ; ++i ) | |
| 3168 { | |
| 3169 attr.f1[i].seVector.Nullify() ; | |
| 3170 attr.f1[i].seVector.Init( seCnt ) ; | |
| 3171 for ( j = i ; j < seCnt && attr.f2 ; ++j ) | |
| 3172 { | |
| 3173 attr.f2[i][j].seVector.Nullify() ; | |
| 3174 attr.f2[i][j].seVector.Init( seCnt ) ; | |
| 3175 } | |
| 3176 } | |
| 3177 for ( i = 0 ; i < hashMax ; ++i ) | |
| 3178 { | |
| 3179 attr.hash[i].seVector.Nullify() ; | |
| 3180 attr.hash[i].seVector.Init( seCnt ) ; | |
| 3181 } | |
| 3182 | |
| 3183 // select candidate transcripts from each sample. | |
| 3184 struct _pair32 *sampleComplexity = new struct _pair32[ sampleCnt ] ; | |
| 3185 for ( i = 0 ; i < sampleCnt ; ++i ) | |
| 3186 { | |
| 3187 sampleComplexity[i].a = i ; | |
| 3188 sampleComplexity[i].b = constraints[i].constraints.size() ; | |
| 3189 } | |
| 3190 qsort( sampleComplexity, sampleCnt, sizeof( sampleComplexity[0] ), CompPairsByB ) ; | |
| 3191 int downsampleCnt = -1 ; | |
| 3192 | |
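| // Samples are sorted by constraint count; only the first sample processed keeps its full constraint count as the DP iteration bound, later samples are capped at 100 (or 10 once many transcripts have accumulated). | |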
| 3193 for ( i = sampleCnt - 1 ; i >= 0 ; --i ) | |
| 3194 { | |
| 3195 sampleTranscripts.clear() ; | |
| 3196 int iterBound = constraints[ sampleComplexity[i].a ].constraints.size() ; | |
| 3197 if ( i < sampleCnt - 1 ) | |
| 3198 iterBound = 100 ; | |
| 3199 | |
| 3200 if ( i < sampleCnt - 10 && alltranscripts.size() > 1000 ) | |
| 3201 iterBound = 10 ; | |
| 3202 //printf( "%d %d: %d %d %d %d\n", subexons[0].start + 1, sampleComplexity[i].a, constraints[ sampleComplexity[i].a ].constraints.size(), constraints[ sampleComplexity[i].a ].matePairs.size(), | |
| 3203 // alltranscripts.size(), iterBound ) ; fflush( stdout ) ; | |
| 3204 if ( maxDpConstraintSize > 0 ) | |
| 3205 { | |
| 3206 Constraints truncatedConstraints ; | |
| 3207 truncatedConstraints.TruncateConstraintsCoverFrom( constraints[ sampleComplexity[i].a ], seCnt, maxDpConstraintSize ) ; | |
| 3208 PickTranscriptsByDP( subexons, seCnt, iterBound, truncatedConstraints, | |
| 3209 subexonCorrelation, attr, sampleTranscripts ) ; | |
| 3210 } | |
| 3211 else if ( ( constraints[ sampleComplexity[i].a ].constraints.size() > 1000 | |
| 3212 && constraints[ sampleComplexity[i].a ].constraints.size() * 10 < constraints[ sampleComplexity[i].a ].matePairs.size() ) | |
| 3213 || ( downsampleCnt > 0 && (int)constraints[ sampleComplexity[i].a ].constraints.size() >= downsampleCnt ) | |
| 3214 || seCnt >= 1500 ) | |
| 3215 { | |
| 3216 Constraints downsampledConstraints ; | |
| 3217 int stride = (int)constraints[ sampleComplexity[i].a ].matePairs.size() / (int)constraints[ sampleComplexity[i].a ].constraints.size() ; | |
| 3218 if ( downsampleCnt > 0 ) | |
| 3219 stride = (int)constraints[ sampleComplexity[i].a ].constraints.size() / downsampleCnt ; | |
| 3220 if ( stride < 1 ) | |
| 3221 stride = 1 ; | |
| 3222 downsampledConstraints.DownsampleConstraintsFrom( constraints[ sampleComplexity[i].a ], stride ) ; | |
| 3223 if ( downsampleCnt <= 0 ) | |
| 3224 downsampleCnt = downsampledConstraints.constraints.size() ; | |
| 3225 if ( iterBound <= 10 ) | |
| 3226 continue ; | |
| 3227 PickTranscriptsByDP( subexons, seCnt, iterBound, downsampledConstraints, subexonCorrelation, attr, sampleTranscripts ) ; | |
| 3228 } | |
| 3229 else | |
| 3230 { | |
| 3231 PickTranscriptsByDP( subexons, seCnt, iterBound, constraints[ sampleComplexity[i].a ], subexonCorrelation, attr, sampleTranscripts ) ; | |
| 3232 } | |
| 3233 int size = sampleTranscripts.size() ; | |
| 3234 for ( j = 0 ; j < size ; ++j ) | |
| 3235 alltranscripts.push_back( sampleTranscripts[j] ) ; | |
| 3236 | |
| 3237 // We can further pick a smaller subset of transcripts here if the number is still too big. | |
| 3238 CoalesceSameTranscripts( alltranscripts ) ; | |
| 3239 | |
| 3240 AugmentTranscripts( subexons, alltranscripts, 1000, false ) ; | |
| 3241 } | |
| 3242 | |
| 3243 // release the memory. | |
| 3244 delete[] sampleComplexity ; | |
| 3245 for ( i = 0 ; i < seCnt ; ++i ) | |
| 3246 { | |
| 3247 attr.f1[i].seVector.Release() ; | |
| 3248 for ( j = i ; j < seCnt && attr.f2 ; ++j ) | |
| 3249 attr.f2[i][j].seVector.Release() ; | |
| 3250 } | |
| 3251 for ( i = 0 ; i < hashMax ; ++i ) | |
| 3252 attr.hash[i].seVector.Release() ; | |
| 3253 | |
| 3254 delete[] attr.f1 ; | |
| 3255 for ( i = 0 ; i < seCnt && attr.f2 ; ++i ) | |
| 3256 delete[] attr.f2[i] ; | |
| 3257 delete[] attr.f2 ; | |
| 3258 if (hashMax != HASH_MAX) | |
| 3259 delete[] attr.hash ; | |
| 3260 | |
| 3261 } | |
| 3262 | |
| 3263 transcriptId = new int[usedGeneId - baseGeneId] ; | |
| 3264 std::vector<struct _transcript> *predTranscripts = new std::vector<struct _transcript>[sampleCnt] ; | |
| 3265 | |
| 3266 atCnt = alltranscripts.size() ; | |
| 3267 for ( i = 0 ; i < atCnt ; ++i ) | |
| 3268 alltranscripts[i].FPKM = 0 ; | |
| 3269 | |
| 3270 for ( i = 0 ; i < sampleCnt ; ++i ) | |
| 3271 { | |
| 3272 int size = alltranscripts.size() ; | |
| 3273 for ( j = 0 ; j < size ; ++j ) | |
| 3274 alltranscripts[j].abundance = -1 ; | |
| 3275 //printf( "pick: %d: %d %d\n", i, constraints[i].matePairs.size(), alltranscripts.size() ) ; | |
| 3276 PickTranscripts( subexons, alltranscripts, constraints[i], subexonCorrelation, predTranscripts[i] ) ; | |
| 3277 | |
| 3278 /*double tmp = FPKMFraction ; | |
| 3279 FPKMFraction = 0 ; | |
| 3280 size = predTranscripts.size() ; | |
| 3281 for ( j = 0 ; j < size ; ++j ) | |
| 3282 { | |
| 3283 ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[j] ) ; | |
| 3284 } | |
| 3285 RefineTranscripts( subexons, seCnt, predTranscripts, constraints[i] ) ; | |
| 3286 FPKMFraction = tmp ;*/ | |
| 3287 | |
| 3288 } | |
| 3289 | |
| 3290 atCnt = alltranscripts.size() ; | |
| 3291 int *txptSampleSupport = new int[atCnt] ; | |
| 3292 memset( txptSampleSupport, 0, sizeof( int ) * atCnt ) ; | |
| 3293 for ( i = 0 ; i < sampleCnt ; ++i ) | |
| 3294 { | |
| 3295 int size = predTranscripts[i].size() ; | |
| 3296 for ( j = 0 ; j < size ; ++j ) | |
| 3297 { | |
| 3298 ++txptSampleSupport[ predTranscripts[i][j].id ] ; | |
| 3299 ++alltranscripts[ predTranscripts[i][j].id ].FPKM ; | |
| 3300 } | |
| 3301 } | |
| 3302 | |
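| // Re-run the per-sample selection now that each candidate's FPKM field holds the number of samples that picked it in the first pass. | |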
| 3303 for ( i = 0 ; i < sampleCnt ; ++i ) | |
| 3304 { | |
| 3305 int size = alltranscripts.size() ; | |
| 3306 for ( j = 0 ; j < size ; ++j ) | |
| 3307 alltranscripts[j].abundance = -1 ; | |
| 3308 //printf( "pick: %d: %d %d\n", i, constraints[i].matePairs.size(), alltranscripts.size() ) ; | |
| 3309 | |
| 3310 size = predTranscripts[i].size() ; | |
| 3311 for ( j = 0 ; j < size ; ++j ) | |
| 3312 { | |
| 3313 predTranscripts[i][j].seVector.Release() ; | |
| 3314 } | |
| 3315 predTranscripts[i].clear() ; | |
| 3316 PickTranscripts( subexons, alltranscripts, constraints[i], subexonCorrelation, predTranscripts[i] ) ; | |
| 3317 } | |
| 3318 | |
| 3319 std::vector<int> *rawPredTranscriptIds = new std::vector<int>[sampleCnt] ; | |
| 3320 std::vector<double> *rawPredTranscriptAbundance = new std::vector<double>[sampleCnt] ; | |
| 3321 for ( i = 0 ; i < sampleCnt ; ++i ) | |
| 3322 { | |
| 3323 int size = predTranscripts[i].size() ; | |
| 3324 | |
| 3325 for ( j = 0 ; j < size ; ++j ) | |
| 3326 { | |
| 3327 rawPredTranscriptIds[i].push_back( predTranscripts[i][j].id ) ; | |
| 3328 rawPredTranscriptAbundance[i].push_back( predTranscripts[i][j].abundance ) ; | |
| 3329 } | |
| 3330 } | |
| 3331 | |
| 3332 // Do the filtration. | |
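| // Each sample is refined twice: a first RefineTranscripts pass on the initial FPKM estimates, then abundance re-estimation followed by a second pass. | |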
| 3333 for ( i = 0 ; i < sampleCnt ; ++i ) | |
| 3334 { | |
| 3335 int size = predTranscripts[i].size() ; | |
| 3336 for ( j = 0 ; j < size ; ++j ) | |
| 3337 { | |
| 3338 ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ; | |
| 3339 } | |
| 3340 size = RefineTranscripts( subexons, seCnt, false, subexonChainSupport, txptSampleSupport, predTranscripts[i], constraints[i] ) ; | |
| 3341 | |
| 3342 // Recompute the abundance. | |
| 3343 AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ; | |
| 3344 for ( j = 0 ; j < size ; ++j ) | |
| 3345 ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ; | |
| 3346 size = RefineTranscripts( subexons, seCnt, true, subexonChainSupport, txptSampleSupport, predTranscripts[i], constraints[i] ) ; | |
| 3347 | |
| 3348 //ComputeTranscriptsScore( subexons, seCnt, subexonChainSupport, predTranscripts[i] ) ; | |
| 3349 } | |
| 3350 | |
| 3351 // Rescue some filtered transcripts | |
| 3352 memset( txptSampleSupport, 0, sizeof( int ) * atCnt ) ; | |
| 3353 for ( i = 0 ; i < sampleCnt ; ++i ) | |
| 3354 { | |
| 3355 int size = predTranscripts[i].size() ; | |
| 3356 for ( j = 0 ; j < size ; ++j ) | |
| 3357 { | |
| 3358 ++txptSampleSupport[ predTranscripts[i][j].id ] ; | |
| 3359 } | |
| 3360 } | |
| 3361 | |
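| // A transcript that was initially picked for a sample but filtered out is restored there if it survived filtering in at least 3 samples and in at least 25% of all samples. | |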
| 3362 bool *predicted = new bool[atCnt] ; | |
| 3363 for ( i = 0 ; i < sampleCnt ; ++i ) | |
| 3364 { | |
| 3365 memset( predicted, false, sizeof( bool ) * atCnt ) ; | |
| 3366 if ( predTranscripts[i].size() != rawPredTranscriptIds[i].size() ) | |
| 3367 { | |
| 3368 int psize = predTranscripts[i].size() ; | |
| 3369 int rsize = rawPredTranscriptIds[i].size() ; | |
| 3370 int tcCnt = constraints[i].matePairs.size() ; | |
| 3371 | |
| 3372 for ( j = 0 ; j < psize ; ++j ) | |
| 3373 predicted[ predTranscripts[i][j].id ] = true ; | |
| 3374 | |
| 3375 for ( j = 0 ; j < rsize ; ++j ) | |
| 3376 { | |
| 3377 int id = rawPredTranscriptIds[i][j] ; | |
| 3378 if ( predicted[ id ] == false && | |
| 3379 ( txptSampleSupport[ id ] >= 3 && txptSampleSupport[id] >= 0.25 * sampleCnt ) ) | |
| 3380 { | |
| 3381 struct _transcript nt = alltranscripts[id] ; | |
| 3382 nt.seVector.Nullify() ; | |
| 3383 nt.seVector.Duplicate( alltranscripts[id].seVector ) ; | |
| 3384 nt.constraintsSupport = NULL ; | |
| 3385 nt.correlationScore = -1 ; | |
| 3386 nt.abundance = rawPredTranscriptAbundance[i][j] ; | |
| 3387 nt.id = id ; | |
| 3388 predTranscripts[i].push_back( nt ) ; | |
| 3389 } | |
| 3390 } | |
| 3391 if ( psize != predTranscripts[i].size() ) | |
| 3392 AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ; | |
| 3393 } | |
| 3394 | |
| 3395 int size = predTranscripts[i].size() ; | |
| 3396 | |
| 3397 if ( 0 ) //size == 1 ) | |
| 3398 { | |
| 3399 //AugmentTranscripts( subexons, predTranscripts[i], false ) ; | |
| 3400 | |
| 3401 int l = predTranscripts[i].size() ; | |
| 3402 int tcCnt = constraints[i].matePairs.size() ; | |
| 3403 for ( j = 0 ; j < l ; ++j ) | |
| 3404 { | |
| 3405 predTranscripts[i][j].abundance = 1.0 / alignments.readLen ; | |
| 3406 } | |
| 3407 AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ; | |
| 3408 | |
| 3409 std::vector<int> subexonIdx ; | |
| 3410 for ( j = 0 ; j < l ; ++j ) | |
| 3411 { | |
| 3412 subexonIdx.clear() ; | |
| 3413 predTranscripts[i][j].seVector.GetOnesIndices( subexonIdx ) ; | |
| 3414 int subexonIdxCnt = subexonIdx.size() ; | |
| 3415 int len = 0 ; | |
| 3416 for ( k = 0 ; k < subexonIdxCnt ; ++k ) | |
| 3417 len += subexons[ subexonIdx[k] ].end - subexons[ subexonIdx[k] ].start + 1 ; | |
| 3418 | |
| 3419 if ( predTranscripts[i][j].abundance * alignments.readLen / len < 2.0 ) | |
| 3420 predTranscripts[i][j].abundance = -1 ; | |
| 3421 else | |
| 3422 ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ; | |
| 3423 | |
| 3424 } | |
| 3425 RemoveNegativeAbundTranscripts( predTranscripts[i] ) ; | |
| 3426 } | |
| 3427 | |
| 3428 // Output | |
| 3429 size = predTranscripts[i].size() ; | |
| 3430 InitTranscriptId() ; | |
| 3431 for ( j = 0 ; j < size ; ++j ) | |
| 3432 { | |
| 3433 OutputTranscript( i, subexons, predTranscripts[i][j] ) ; | |
| 3434 } | |
| 3435 for ( j = 0 ; j < size ; ++j ) | |
| 3436 { | |
| 3437 predTranscripts[i][j].seVector.Release() ; | |
| 3438 } | |
| 3439 } | |
| 3440 | |
| 3441 delete []predicted ; | |
| 3442 delete []transcriptId ; | |
| 3443 delete []predTranscripts ; | |
| 3444 delete []rawPredTranscriptIds ; | |
| 3445 delete []rawPredTranscriptAbundance ; | |
| 3446 delete []txptSampleSupport ; | |
| 3447 | |
| 3448 atCnt = alltranscripts.size() ; | |
| 3449 for ( i = 0 ; i < atCnt ; ++i ) | |
| 3450 alltranscripts[i].seVector.Release() ; | |
| 3451 compatibleTestVectorT.Release() ; | |
| 3452 compatibleTestVectorC.Release() ; | |
| 3453 delete[] f ; | |
| 3454 delete[] subexonChainSupport ; | |
| 3455 return 0 ; | |
| 3456 } | |
| 3457 | |
| 3458 void *TranscriptDeciderSolve_Wrapper( void *a ) | |
| 3459 { | |
| 3460 int i ; | |
| 3461 | |
| 3462 struct _transcriptDeciderThreadArg &arg = *( (struct _transcriptDeciderThreadArg *)a ) ; | |
| 3463 TranscriptDecider transcriptDecider( arg.FPKMFraction, arg.classifierThreshold, arg.txptMinReadDepth, arg.sampleCnt, *( arg.alignments ) ) ; | |
| 3464 transcriptDecider.SetNumThreads( arg.numThreads + 1 ) ; | |
| 3465 transcriptDecider.SetMultiThreadOutputHandler( arg.outputHandler ) ; | |
| 3466 transcriptDecider.SetMaxDpConstraintSize( arg.maxDpConstraintSize ) ; | |
| 3467 transcriptDecider.Solve( arg.subexons, arg.seCnt, arg.constraints, arg.subexonCorrelation ) ; | |
| 3468 | |
| 3469 int start = arg.subexons[0].start ; | |
| 3470 int end = arg.subexons[ arg.seCnt - 1 ].end ; | |
| 3471 int chrId = arg.subexons[0].chrId ; | |
| 3472 // Release memory | |
| 3473 for ( i = 0 ; i < arg.seCnt ; ++i ) | |
| 3474 { | |
| 3475 delete[] arg.subexons[i].prev ; | |
| 3476 delete[] arg.subexons[i].next ; | |
| 3477 } | |
| 3478 delete[] arg.subexons ; | |
| 3479 | |
| 3480 // Return this thread id to the free-threads queue. | |
| 3481 pthread_mutex_lock( arg.ftLock ) ; | |
| 3482 arg.freeThreads[ *( arg.ftCnt ) ] = arg.tid ; | |
| 3483 ++*( arg.ftCnt ) ; | |
| 3484 if ( *( arg.ftCnt ) == 1 ) | |
| 3485 pthread_cond_signal( arg.fullWorkCond ) ; | |
| 3486 pthread_mutex_unlock( arg.ftLock) ; | |
| 3487 printf( "Thread %d: %s %d %d finished.\n", arg.tid, arg.alignments->GetChromName(chrId), start + 1, end + 1 ) ; | |
| 3488 fflush( stdout ) ; | |
| 3489 | |
| 3490 | |
| 3491 pthread_exit( NULL ) ; | |
| 3492 } | |
